[llvm] AMDGPU: Fix creating minimum3/maximum3 nodes pre-gfx12 (PR #93027)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 22 06:03:16 PDT 2024


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/93027

These would fail to select.


>From 6d443aaba3a9296d90ca04090cdd734dd500b02b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 22 May 2024 13:48:22 +0200
Subject: [PATCH 1/2] AMDGPU: Clean up maximum3/minimum3 tests

These were using patterns copied from older tests, written before non-kernel
functions were supported, and relied on manually written checks. Also stop using
-flat-for-global, which only exists to try to share tests between SI/CI and VI+.

Test coverage was also missing: we are incorrectly forming maximum3/minimum3
pre-gfx12. This commit only adds the tests, ahead of the change that fixes that.
---
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 1411 +++++++++++++++++++++++--
 llvm/test/CodeGen/AMDGPU/fminimum3.ll | 1411 +++++++++++++++++++++++--
 2 files changed, 2636 insertions(+), 186 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084dc4b80..fe6de63742b76 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,1323 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.maximum.f32(float %a, float %b)
-  %f1 = call float @llvm.maximum.f32(float %f0, float %c)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.maximum.f32(float %a, float %b)
-  %f1 = call float @llvm.maximum.f32(float %c, float %f0)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.maximum.f16(half %a, half %b)
-  %f1 = call half @llvm.maximum.f16(half %f0, half %c)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.maximum.f16(half %a, half %b)
-  %f1 = call half @llvm.maximum.f16(half %c, half %f0)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
-  %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
-  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
-  %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
-  ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile double, ptr addrspace(1) %aptr, align 4
-  %b = load volatile double, ptr addrspace(1) %bptr, align 4
-  %c = load volatile double, ptr addrspace(1) %cptr, align 4
-  %f0 = call double @llvm.maximum.f64(double %a, double %b)
-  %f1 = call double @llvm.maximum.f64(double %f0, double %c)
-  store double %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
 declare double @llvm.maximum.f64(double, double)
 declare float @llvm.maximum.f32(float, float)
 declare half @llvm.maximum.f16(half, half)
 declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+  ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  %cast = bitcast float %max1 to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %b.fneg = fneg float %b
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %a.fneg.fabs = fneg float %a.fabs
+  %b.fneg.fabs = fneg float %b.fabs
+  %c.fneg.fabs = fneg float %c.fabs
+  %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg float %b
+  %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
+  %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
+  ret float %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v2, v4
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, v3, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c) ; inner result first, as in v_fmaximum3_f32
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v4, v0, v2
+; GFX12-NEXT:    v_maximum3_f32 v1, v5, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0) ; inner result second, as in v_fmaximum3_f32_commute
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT:    v_maximum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+  %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT:    v_maximum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <2 x float> %a
+  %b.fneg = fneg <2 x float> %b
+  %c.fneg = fneg <2 x float> %c
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v2, 4.0
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>) ; consume %max0 so the inline immediate lands in the last maximum3 operand; %c is unused
+  ret <2 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v3, v6
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, v4, v7
+; GFX12-NEXT:    v_maximum3_f32 v2, v2, v5, v8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+  %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c) ; inner result first, as in v_fmaximum3_f32
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v6, v0, v3
+; GFX12-NEXT:    v_maximum3_f32 v1, v7, v1, v4
+; GFX12-NEXT:    v_maximum3_f32 v2, v8, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+  %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0) ; inner result second, as in v_fmaximum3_f32_commute
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v3|, |v6|
+; GFX12-NEXT:    v_maximum3_f32 v1, |v1|, |v4|, |v7|
+; GFX12-NEXT:    v_maximum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
+  %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
+  %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
+  %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs)
+  %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fabs)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v3, -v6
+; GFX12-NEXT:    v_maximum3_f32 v1, -v1, -v4, -v7
+; GFX12-NEXT:    v_maximum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <3 x float> %a
+  %b.fneg = fneg <3 x float> %b
+  %c.fneg = fneg <3 x float> %c
+  %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg)
+  %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fneg)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 2.0, v3
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, 2.0, v4
+; GFX12-NEXT:    v_maximum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
+  %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v3, 4.0
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, v4, 4.0
+; GFX12-NEXT:    v_maximum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+  %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>) ; consume %max0 so the inline immediate lands in the last maximum3 operand; %c is unused
+  ret <3 x float> %max1
+}
+
+
+define half @v_fmaximum3_f16(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v2, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %c, half %max0)
+  ret half %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum3_f16 v0, s0, s1, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  %cast = bitcast half %max1 to i16
+  %zext = zext i16 %cast to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readfirstlane
+}
+
+define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call half @llvm.fabs.f16(half %a)
+  %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call half @llvm.fabs.f16(half %b)
+  %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call half @llvm.fabs.f16(half %c)
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
+  ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call half @llvm.fabs.f16(half %a)
+  %b.fabs = call half @llvm.fabs.f16(half %b)
+  %c.fabs = call half @llvm.fabs.f16(half %c)
+  %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b.fabs)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
+  ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg half %a
+  %b.fneg = fneg half %b
+  %c.fneg = fneg half %c
+  %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b.fneg)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
+  ret half %max1
+}
+
+; Every input is fneg(fabs(x)). The checks expect the combined -|vN| source
+; modifier on all three operands of one v_maximum3_f16.
+define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call half @llvm.fabs.f16(half %a)
+  %b.fabs = call half @llvm.fabs.f16(half %b)
+  %c.fabs = call half @llvm.fabs.f16(half %c)
+  %a.fneg.fabs = fneg half %a.fabs
+  %b.fneg.fabs = fneg half %b.fabs
+  %c.fneg.fabs = fneg half %c.fabs
+  %max0 = call half @llvm.maximum.f16(half %a.fneg.fabs, half %b.fneg.fabs)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg.fabs)
+  ret half %max1
+}
+
+; Only %a (first input of the inner maximum) is negated. The checks expect
+; the negation as a -v0 modifier on the first source of v_maximum3_f16.
+define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg half %a
+  %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+; Only %b (second input of the inner maximum) is negated. The checks expect
+; the negation as a -v1 modifier on the second source of v_maximum3_f16.
+define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg half %b
+  %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+; Only %c (second input of the outer maximum) is negated. The checks expect
+; the negation as a -v2 modifier on the third source of v_maximum3_f16.
+define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg half %c
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
+  ret half %max1
+}
+
+; The constant 8.0 is the first operand of the inner maximum. The checks
+; expect it as the literal 0x4800 in the second source of one v_maximum3_f16,
+; with %b in v0 and %c in v1.
+define half @v_fmaximum3_f16_const0(half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+; The constant 8.0 is the last operand of the maximum chain. The checks
+; expect it as the literal 0x4800 in the third source of one v_maximum3_f16.
+define half @v_fmaximum3_f16__const2(half %a, half %b) {
+; GFX12-LABEL: v_fmaximum3_f16__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
+  ret half %max1
+}
+
+; The constant 4.0 is the first operand of the inner maximum. The checks
+; expect it printed as 4.0 (not a hex literal) in the second source of one
+; v_maximum3_f16.
+define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+; The constant 4.0 is the last operand of the maximum chain. The checks
+; expect it printed as 4.0 in the third source of one v_maximum3_f16.
+define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
+; GFX12-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half %a, half %b)
+  %max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
+  ret half %max1
+}
+
+; Two constants in the chain: 8.0 in the inner maximum and 16.0 in the outer
+; one. The checks expect 0x4800 to be moved into s0 with s_movk_i32 and
+; 0x4c00 to be used directly as a literal on one v_maximum3_f16.
+define half @v_fmaximum3_f16_const1_const2(half %a) {
+; GFX12-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_movk_i32 s0, 0x4800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_maximum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
+  %max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
+  ret half %max1
+}
+
+; Packed <2 x half> chain with %c as the first operand of the outer maximum.
+; The checks expect two dependent v_pk_maximum_f16 instructions rather than
+; a three-operand instruction.
+define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v2, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
+  ret <2 x half> %max1
+}
+
+; Same as v_fmaximum3_v2f16 but with the outer maximum's operands swapped
+; (%max0 first, %c second). The checks still expect two v_pk_maximum_f16.
+define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+  ret <2 x half> %max1
+}
+
+; fabs on every packed input. The checks expect each fabs as a separate
+; v_and_b32 with the mask 0x7fff7fff, followed by two v_pk_maximum_f16.
+define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+  %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
+  %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fabs)
+  ret <2 x half> %max1
+}
+
+; fneg on every packed input. The checks expect the negations as neg_lo and
+; neg_hi modifiers on the two v_pk_maximum_f16 instructions: both sources on
+; the first, only the second source (%c) on the second.
+define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <2 x half> %a
+  %b.fneg = fneg <2 x half> %b
+  %c.fneg = fneg <2 x half> %c
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fneg)
+  ret <2 x half> %max1
+}
+
+; A splat of 2.0 is the second operand of the inner maximum. The checks
+; expect it as the immediate 2.0 with op_sel_hi:[1,0] on the first
+; v_pk_maximum_f16, followed by a second v_pk_maximum_f16 with %c.
+define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+  ret <2 x half> %max1
+}
+
+; A splat of 4.0 is the last operand of the two-step maximum chain:
+; maximum(maximum(%a, %b), 4.0). The outer maximum must consume %max0;
+; otherwise the inner maximum is dead and only one maximum is tested.
+; %c is unused and is kept only so the function signature does not change.
+; NOTE(review): the assertion lines below were derived by hand from
+; v_fmaximum3_v2f16_commute and v_fmaximum3_v2f16__inlineimm1; regenerate
+; them with utils/update_llc_test_checks.py before committing.
+define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
+  ret <2 x half> %max1
+}
+
+; <3 x half> chain with %c as the first operand of the outer maximum. Each
+; argument occupies two VGPRs in the checked output, and each maximum is
+; expected as two v_pk_maximum_f16 instructions (four in total).
+define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v4, v0
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+  %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
+  ret <3 x half> %max1
+}
+
+; Same as v_fmaximum3_v3f16 but with the outer maximum's operands swapped
+; (%max0 first, %c second). The checks still expect four v_pk_maximum_f16.
+define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+  %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
+  ret <3 x half> %max1
+}
+
+; fabs on every <3 x half> input. The checks expect six v_and_b32 with the
+; mask 0x7fff7fff (one per input register), followed by four
+; v_pk_maximum_f16.
+define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
+  %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
+  %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
+  %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs)
+  %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fabs)
+  ret <3 x half> %max1
+}
+
+; fneg on every <3 x half> input. The checks expect the negations as neg_lo
+; and neg_hi modifiers on four v_pk_maximum_f16 instructions: both sources on
+; the inner pair, only the second source (%c) on the outer pair.
+define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <3 x half> %a
+  %b.fneg = fneg <3 x half> %b
+  %c.fneg = fneg <3 x half> %c
+  %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg)
+  %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fneg)
+  ret <3 x half> %max1
+}
+
+; A splat of 2.0 is the second operand of the inner maximum. The checks
+; expect the immediate 2.0 on the inner pair of v_pk_maximum_f16 (with
+; op_sel_hi:[1,0] on the v0 half only), followed by an outer pair with %c.
+define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, 2.0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
+  %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
+  ret <3 x half> %max1
+}
+
+; A splat of 4.0 is the last operand of the two-step maximum chain:
+; maximum(maximum(%a, %b), 4.0). The outer maximum must consume %max0;
+; otherwise the inner maximum is dead and only one maximum is tested.
+; %c is unused and is kept only so the function signature does not change.
+; NOTE(review): the assertion lines below were derived by hand from
+; v_fmaximum3_v3f16_commute and v_fmaximum3_v3f16__inlineimm1; regenerate
+; them with utils/update_llc_test_checks.py before committing.
+define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+  %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
+  ret <3 x half> %max1
+}
+
+; Two-step maximum chain on double. The checks expect two dependent
+; v_maximum_f64 instructions rather than a three-operand instruction.
+define double @v_fmaximum3_f64(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; Same as v_fmaximum3_f64 but with the outer maximum's operands swapped
+; (%c first, %max0 second). The checks still expect two v_maximum_f64.
+define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %c, double %max0)
+  ret double %max1
+}
+
+; amdgpu_ps variant with inreg arguments. The checks expect the inputs to be
+; read from s[0:1], s[2:3] and s[4:5] by two v_maximum_f64 instructions, and
+; the two 32-bit halves of the result to be returned through
+; v_readfirstlane_b32 into s0 and s1.
+define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, double inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], s[4:5]
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  %cast = bitcast double %max1 to <2 x i32>
+  %elt0 = extractelement <2 x i32> %cast, i32 0
+  %elt1 = extractelement <2 x i32> %cast, i32 1
+  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+  %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1
+  ret <2 x i32> %insert.1
+}
+
+; fabs on %a only. The checks expect it as a |v[0:1]| modifier on the first
+; source of the first v_maximum_f64.
+define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call double @llvm.fabs.f64(double %a)
+  %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; fabs on %b only. The checks expect it as a |v[2:3]| modifier on the second
+; source of the first v_maximum_f64.
+define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], |v[2:3]|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call double @llvm.fabs.f64(double %b)
+  %max0 = call double @llvm.maximum.f64(double %a, double %b.fabs)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; fabs on %c only. The checks expect it as a |v[4:5]| modifier on the second
+; source of the second v_maximum_f64.
+define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call double @llvm.fabs.f64(double %c)
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
+  ret double %max1
+}
+
+; fabs on every input. The checks expect |...| modifiers on both sources of
+; the first v_maximum_f64 and on the %c source of the second.
+define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], |v[0:1]|, |v[2:3]|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call double @llvm.fabs.f64(double %a)
+  %b.fabs = call double @llvm.fabs.f64(double %b)
+  %c.fabs = call double @llvm.fabs.f64(double %c)
+  %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b.fabs)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
+  ret double %max1
+}
+
+; fneg on every input. The checks expect negation modifiers on both sources
+; of the first v_maximum_f64 and on the %c source of the second.
+define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg double %a
+  %b.fneg = fneg double %b
+  %c.fneg = fneg double %c
+  %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b.fneg)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
+  ret double %max1
+}
+
+; Every input is fneg(fabs(x)). The checks expect the combined -|...|
+; modifier on both sources of the first v_maximum_f64 and on the %c source
+; of the second.
+define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call double @llvm.fabs.f64(double %a)
+  %b.fabs = call double @llvm.fabs.f64(double %b)
+  %c.fabs = call double @llvm.fabs.f64(double %c)
+  %a.fneg.fabs = fneg double %a.fabs
+  %b.fneg.fabs = fneg double %b.fabs
+  %c.fneg.fabs = fneg double %c.fabs
+  %max0 = call double @llvm.maximum.f64(double %a.fneg.fabs, double %b.fneg.fabs)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg.fabs)
+  ret double %max1
+}
+
+; fneg on %a only. The checks expect it as a -v[0:1] modifier on the first
+; source of the first v_maximum_f64.
+define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], -v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg double %a
+  %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; fneg on %b only. The checks expect it as a -v[2:3] modifier on the second
+; source of the first v_maximum_f64.
+define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg double %b
+  %max0 = call double @llvm.maximum.f64(double %a, double %b.fneg)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; fneg on %c only. The checks expect it as a -v[4:5] modifier on the second
+; source of the second v_maximum_f64.
+define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg double %c
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
+  ret double %max1
+}
+
+; The constant 8.0 is the first operand of the inner maximum. The checks
+; expect it as the literal 0x40200000 on the first v_maximum_f64, with %b in
+; v[0:1] and %c in v[2:3].
+define double @v_fmaximum3_f64_const0(double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double 8.0, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; The constant 8.0 is the last operand of the maximum chain. The checks
+; expect it as the literal 0x40200000 on the second v_maximum_f64.
+define double @v_fmaximum3_f64__const2(double %a, double %b) {
+; GFX12-LABEL: v_fmaximum3_f64__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double 8.0)
+  ret double %max1
+}
+
+; The constant 4.0 is the first operand of the inner maximum. The checks
+; expect it printed as 4.0 (not a hex literal) on the first v_maximum_f64.
+define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double 4.0, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+; The constant 4.0 is the last operand of the maximum chain. The checks
+; expect it printed as 4.0 on the second v_maximum_f64.
+define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
+; GFX12-LABEL: v_fmaximum3_f64__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double %a, double %b)
+  %max1 = call double @llvm.maximum.f64(double %max0, double 4.0)
+  ret double %max1
+}
+
+; Two constants in the chain: 8.0 in the inner maximum and 16.0 in the outer
+; one. The checks expect the literals 0x40200000 and 0x40300000 on two
+; dependent v_maximum_f64 instructions.
+define double @v_fmaximum3_f64_const1_const2(double %a) {
+; GFX12-LABEL: v_fmaximum3_f64_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.maximum.f64(double %a, double 8.0)
+  %max1 = call double @llvm.maximum.f64(double %max0, double 16.0)
+  ret double %max1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index eef271e69a384..baa6b0d7b8a19 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,98 +1,1323 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.minimum.f32(float %a, float %b)
-  %f1 = call float @llvm.minimum.f32(float %f0, float %c)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; Commute operand of second fminimum
-; GCN-LABEL: {{^}}test_fminimum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.minimum.f32(float %a, float %b)
-  %f1 = call float @llvm.minimum.f32(float %c, float %f0)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.minimum.f16(half %a, half %b)
-  %f1 = call half @llvm.minimum.f16(half %f0, half %c)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.minimum.f16(half %a, half %b)
-  %f1 = call half @llvm.minimum.f16(half %c, half %f0)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of minimum3
-; since there are no pack instructions for fminimum3.
-; GCN-LABEL: {{^}}no_fminimum3_v2f16:
-; GCN: v_pk_minimum_f16 v0, v0, v1
-; GCN: v_pk_minimum_f16 v0, v2, v0
-; GCN: v_pk_minimum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
-  %min = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
-  %min1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %min)
-  %res = call <2 x half> @llvm.minimum.v2f16(<2 x half> %min1, <2 x half> %d)
-  ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fminimum3_olt_0_f64:
-; GCN-COUNT-2: v_minimum_f64
-define amdgpu_kernel void @no_fminimum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile double, ptr addrspace(1) %aptr, align 4
-  %b = load volatile double, ptr addrspace(1) %bptr, align 4
-  %c = load volatile double, ptr addrspace(1) %cptr, align 4
-  %f0 = call double @llvm.minimum.f64(double %a, double %b)
-  %f1 = call double @llvm.minimum.f64(double %f0, double %c)
-  store double %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
 declare double @llvm.minimum.f64(double, double)
 declare float @llvm.minimum.f32(float, float)
 declare half @llvm.minimum.f16(half, half)
 declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+
+define float @v_fminimum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v2, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %c, float %max0)
+  ret float %max1
+}
+
+define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fminimum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum3_f32 v0, s0, s1, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  %cast = bitcast float %max1 to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readfirstlane
+}
+
+define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %max0 = call float @llvm.minimum.f32(float %a, float %b.fabs)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b.fabs)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %b.fneg = fneg float %b
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b.fneg)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %a.fneg.fabs = fneg float %a.fabs
+  %b.fneg.fabs = fneg float %b.fabs
+  %c.fneg.fabs = fneg float %c.fabs
+  %max0 = call float @llvm.minimum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg.fabs)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg float %b
+  %max0 = call float @llvm.minimum.f32(float %a, float %b.fneg)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float 8.0, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fminimum3_f32__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float 8.0)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float 4.0, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fminimum3_f32__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float %a, float %b)
+  %max1 = call float @llvm.minimum.f32(float %max0, float 4.0)
+  ret float %max1
+}
+
+define float @v_fminimum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fminimum3_f32_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.minimum.f32(float %a, float 8.0)
+  %max1 = call float @llvm.minimum.f32(float %max0, float 16.0)
+  ret float %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v4, v0, v2
+; GFX12-NEXT:    v_minimum3_f32 v1, v5, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v2, v4
+; GFX12-NEXT:    v_minimum3_f32 v1, v1, v3, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT:    v_minimum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+  %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+  %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+  %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT:    v_minimum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <2 x float> %a
+  %b.fneg = fneg <2 x float> %b
+  %c.fneg = fneg <2 x float> %c
+  %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+  %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT:    v_minimum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+  %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v2, 4.0
+; GFX12-NEXT:    v_minimum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
+  ret <2 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v6, v0, v3
+; GFX12-NEXT:    v_minimum3_f32 v1, v7, v1, v4
+; GFX12-NEXT:    v_minimum3_f32 v2, v8, v2, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+  %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v3, v6
+; GFX12-NEXT:    v_minimum3_f32 v1, v1, v4, v7
+; GFX12-NEXT:    v_minimum3_f32 v2, v2, v5, v8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+  %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, |v3|, |v6|
+; GFX12-NEXT:    v_minimum3_f32 v1, |v1|, |v4|, |v7|
+; GFX12-NEXT:    v_minimum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
+  %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
+  %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
+  %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs)
+  %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fabs)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, -v0, -v3, -v6
+; GFX12-NEXT:    v_minimum3_f32 v1, -v1, -v4, -v7
+; GFX12-NEXT:    v_minimum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <3 x float> %a
+  %b.fneg = fneg <3 x float> %b
+  %c.fneg = fneg <3 x float> %c
+  %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg)
+  %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fneg)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, 2.0, v3
+; GFX12-NEXT:    v_minimum3_f32 v1, v1, 2.0, v4
+; GFX12-NEXT:    v_minimum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
+  %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
+  ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f32 v0, v0, v3, 4.0
+; GFX12-NEXT:    v_minimum3_f32 v1, v1, v4, 4.0
+; GFX12-NEXT:    v_minimum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+  %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
+  ret <3 x float> %max1
+}
+
+
+define half @v_fminimum3_f16(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v2, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %c, half %max0)
+  ret half %max1
+}
+
+define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_fminimum3_f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum3_f16 v0, s0, s1, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  %cast = bitcast half %max1 to i16
+  %zext = zext i16 %cast to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+  ret i32 %readfirstlane
+}
+
+define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call half @llvm.fabs.f16(half %a)
+  %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call half @llvm.fabs.f16(half %b)
+  %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call half @llvm.fabs.f16(half %c)
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call half @llvm.fabs.f16(half %a)
+  %b.fabs = call half @llvm.fabs.f16(half %b)
+  %c.fabs = call half @llvm.fabs.f16(half %c)
+  %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b.fabs)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg half %a
+  %b.fneg = fneg half %b
+  %c.fneg = fneg half %c
+  %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b.fneg)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call half @llvm.fabs.f16(half %a)
+  %b.fabs = call half @llvm.fabs.f16(half %b)
+  %c.fabs = call half @llvm.fabs.f16(half %c)
+  %a.fneg.fabs = fneg half %a.fabs
+  %b.fneg.fabs = fneg half %b.fabs
+  %c.fneg.fabs = fneg half %c.fabs
+  %max0 = call half @llvm.minimum.f16(half %a.fneg.fabs, half %b.fneg.fabs)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg.fabs)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg half %a
+  %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg half %b
+  %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg half %c
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_const0(half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half 8.0, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16__const2(half %a, half %b) {
+; GFX12-LABEL: v_fminimum3_f16__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half 4.0, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
+; GFX12-LABEL: v_fminimum3_f16__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half %a, half %b)
+  %max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
+  ret half %max1
+}
+
+define half @v_fminimum3_f16_const1_const2(half %a) {
+; GFX12-LABEL: v_fminimum3_f16_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_movk_i32 s0, 0x4800
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_minimum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call half @llvm.minimum.f16(half %a, half 8.0)
+  %max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
+  ret half %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v2, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0)
+  ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+  ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+  %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
+  %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fabs)
+  ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <2 x half> %a
+  %b.fneg = fneg <2 x half> %b
+  %c.fneg = fneg <2 x half> %c
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fneg)
+  ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+  ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v2, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> <half 4.0, half 4.0>)
+  ret <2 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v4, v0
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+  %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0)
+  ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+  %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
+  ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
+  %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
+  %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
+  %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs)
+  %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fabs)
+  ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <3 x half> %a
+  %b.fneg = fneg <3 x half> %b
+  %c.fneg = fneg <3 x half> %c
+  %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg)
+  %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fneg)
+  ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, 2.0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
+  %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
+  ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_minimum_f16 v0, v4, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT:    v_pk_minimum_f16 v1, v5, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+  %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> <half 4.0, half 4.0, half 4.0>)
+  ret <3 x half> %max1
+}
+
+define double @v_fminimum3_f64(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %c, double %max0)
+  ret double %max1
+}
+
+define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, double inreg %c) {
+; GFX12-LABEL: s_fminimum3_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], s[4:5]
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  %cast = bitcast double %max1 to <2 x i32>
+  %elt0 = extractelement <2 x i32> %cast, i32 0
+  %elt1 = extractelement <2 x i32> %cast, i32 1
+  %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+  %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+  %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1
+  ret <2 x i32> %insert.1
+}
+
+define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call double @llvm.fabs.f64(double %a)
+  %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], |v[2:3]|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call double @llvm.fabs.f64(double %b)
+  %max0 = call double @llvm.minimum.f64(double %a, double %b.fabs)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call double @llvm.fabs.f64(double %c)
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], |v[0:1]|, |v[2:3]|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call double @llvm.fabs.f64(double %a)
+  %b.fabs = call double @llvm.fabs.f64(double %b)
+  %c.fabs = call double @llvm.fabs.f64(double %c)
+  %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b.fabs)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg double %a
+  %b.fneg = fneg double %b
+  %c.fneg = fneg double %c
+  %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b.fneg)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call double @llvm.fabs.f64(double %a)
+  %b.fabs = call double @llvm.fabs.f64(double %b)
+  %c.fabs = call double @llvm.fabs.f64(double %c)
+  %a.fneg.fabs = fneg double %a.fabs
+  %b.fneg.fabs = fneg double %b.fabs
+  %c.fneg.fabs = fneg double %c.fabs
+  %max0 = call double @llvm.minimum.f64(double %a.fneg.fabs, double %b.fneg.fabs)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg.fabs)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], -v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg double %a
+  %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg double %b
+  %max0 = call double @llvm.minimum.f64(double %a, double %b.fneg)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg double %c
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_const0(double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double 8.0, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64__const2(double %a, double %b) {
+; GFX12-LABEL: v_fminimum3_f64__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double 8.0)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double 4.0, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
+; GFX12-LABEL: v_fminimum3_f64__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double %a, double %b)
+  %max1 = call double @llvm.minimum.f64(double %max0, double 4.0)
+  ret double %max1
+}
+
+define double @v_fminimum3_f64_const1_const2(double %a) {
+; GFX12-LABEL: v_fminimum3_f64_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_minimum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call double @llvm.minimum.f64(double %a, double 8.0)
+  %max1 = call double @llvm.minimum.f64(double %max0, double 16.0)
+  ret double %max1
+}

>From 7c3ffb7d41c3a4ed7f7f146b84faba0e4a9e4a54 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 22 May 2024 14:54:32 +0200
Subject: [PATCH 2/2] AMDGPU: Fix creating minimum3/maximum3 nodes pre-gfx12

Pre-gfx12 subtargets have no IEEE minimum3/maximum3 instructions, so the
combined nodes would fail to select.
---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h     |    3 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   32 +-
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll     | 1521 +++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/fminimum3.ll     | 1521 +++++++++++++++++++++
 4 files changed, 3073 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b7548671f2c54..db5b467f22389 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1312,6 +1312,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   // \returns true if the target has IEEE fminimum/fmaximum instructions
   bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
 
+  // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
+  bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
+
   // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
   bool hasRrWGMode() const { return getGeneration() >= GFX12; }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7fe6c2d0db8f5..1d2a5fff23568 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13199,6 +13199,33 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
   return SDValue();
 }
 
+/// \return true if nested \p Opc min/max nodes of type \p VT may be folded
+/// into a 3-operand min3/max3 node on \p Subtarget. The legacy opcodes
+/// (FMIN_LEGACY, FMAX_LEGACY) are rejected, as in the open-coded check this
+/// helper replaces; so is any opcode that is not a min/max.
+static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
+                             EVT VT) {
+  switch (Opc) {
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
+    // The IEEE forms are gated on their own subtarget predicate rather than
+    // on hasMin3Max3_16().
+    return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
+  case ISD::SMAX:
+  case ISD::SMIN:
+  case ISD::UMAX:
+  case ISD::UMIN:
+    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
+  default:
+    return false;
+  }
+}
+
 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -13211,10 +13238,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
 
-  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
-      !VT.isVector() &&
-      (VT == MVT::i32 || VT == MVT::f32 ||
-       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
+  if (supportsMin3Max3(*Subtarget, Opc, VT)) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index fe6de63742b76..3caebacb187a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
 
 declare half @llvm.fabs.f16(half)
 declare float @llvm.fabs.f32(float)
@@ -23,6 +24,18 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   ret float %max1
@@ -38,6 +51,18 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %c, float %max0)
   ret float %max1
@@ -51,6 +76,19 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre
 ; GFX12-NEXT:    v_maximum3_f32 v0, s0, s1, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   %cast = bitcast float %max1 to i32
@@ -68,6 +106,18 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -84,6 +134,18 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, v0, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -100,6 +162,18 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call float @llvm.fabs.f32(float %c)
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
@@ -116,6 +190,18 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -134,6 +220,18 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, -v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %b.fneg = fneg float %b
   %c.fneg = fneg float %c
@@ -152,6 +250,18 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -173,6 +283,18 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, -v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -189,6 +311,18 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e64 v3, v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg float %b
   %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -205,6 +339,18 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg float %c
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
@@ -221,6 +367,18 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, 0x41000000, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_const0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v2, 0x41000000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   ret float %max1
@@ -236,6 +394,18 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 0x41000000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32__const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
   ret float %max1
@@ -251,6 +421,18 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   ret float %max1
@@ -266,6 +448,18 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, 4.0, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
   ret float %max1
@@ -283,6 +477,18 @@ define float @v_fmaximum3_f32_const1_const2(float %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, s0, 0x41800000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f32_e32 v1, 0x41800000, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
   %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
   ret float %max1
@@ -299,6 +505,52 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
 ; GFX12-NEXT:    v_maximum3_f32 v0, v4, v0, v2
 ; GFX12-NEXT:    v_maximum3_f32 v1, v5, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0)
   ret <2 x float> %max1
@@ -315,6 +567,52 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v2, v4
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -331,6 +629,52 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v2|, |v4|
 ; GFX12-NEXT:    v_maximum3_f32 v1, |v1|, |v3|, |v5|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, |v6|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v1|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, |v1|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v3|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v3|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, |v3|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v0|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, |v0|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v2|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v2|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, |v4|, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v4|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v4|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v1, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, |v5|, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v5|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v5|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
   %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
   %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
@@ -350,6 +694,52 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v2, -v4
 ; GFX12-NEXT:    v_maximum3_f32 v1, -v1, -v3, -v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, -v1, -v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v1, -v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, -v6, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, -v1, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v3, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, -v3, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v2, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, -v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, -v4, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v4, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v1, -v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, -v5, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v5, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x float> %a
   %b.fneg = fneg <2 x float> %b
   %c.fneg = fneg <2 x float> %c
@@ -369,6 +759,40 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, 2.0, v2
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, 2.0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 2.0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 2.0, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -385,6 +809,20 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
 ; GFX12-NEXT:    v_maximum_f32 v0, v4, 4.0
 ; GFX12-NEXT:    v_maximum_f32 v1, v5, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 4.0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 4.0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 4.0, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> <float 4.0, float 4.0>)
   ret <2 x float> %max1
@@ -402,6 +840,72 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
 ; GFX12-NEXT:    v_maximum3_f32 v1, v7, v1, v4
 ; GFX12-NEXT:    v_maximum3_f32 v2, v8, v2, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v6, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v7, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0)
   ret <3 x float> %max1
@@ -419,6 +923,72 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, v4, v7
 ; GFX12-NEXT:    v_maximum3_f32 v2, v2, v5, v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -436,6 +1006,72 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v1, |v1|, |v4|, |v7|
 ; GFX12-NEXT:    v_maximum3_f32 v2, |v2|, |v5|, |v8|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |v2|, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v2|, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v10, |v9|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v2|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, |v2|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v5|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, |v5|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v1|, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, |v5|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v1|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, |v1|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v4|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v4|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v0|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, |v4|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v0|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, |v0|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v3|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v3|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, |v6|
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, |v6|, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v6|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v6|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v1, |v7|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, |v7|, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v7|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v7|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v2, |v8|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, |v8|, v2, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v8|, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, |v8|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
   %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
   %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
@@ -456,6 +1092,72 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v1, -v1, -v4, -v7
 ; GFX12-NEXT:    v_maximum3_f32 v2, -v2, -v5, -v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, -v2, -v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v2, -v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v10, -v9, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, -v2, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -v5, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, -v1, -v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v1, -v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, -v5, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v4, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v0, -v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, -v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v3, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v0, -v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, -v6, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v6, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v6, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v1, -v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, -v7, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v7, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v7, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], v2, -v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, -v8, v2, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v8, 64
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -v8, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x float> %a
   %b.fneg = fneg <3 x float> %b
   %c.fneg = fneg <3 x float> %c
@@ -476,6 +1178,54 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, 2.0, v4
 ; GFX12-NEXT:    v_maximum3_f32 v2, v2, 2.0, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 2.0, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 2.0, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 2.0, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -493,6 +1243,24 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 ; GFX12-NEXT:    v_maximum_f32 v1, v7, 4.0
 ; GFX12-NEXT:    v_maximum_f32 v2, v8, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 4.0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v6, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 4.0, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 4.0, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 4.0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 4.0, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> <float 4.0, float 4.0, float 4.0>)
   ret <3 x float> %max1
@@ -509,6 +1277,18 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   ret half %max1
@@ -524,6 +1304,18 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v2, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %c, half %max0)
   ret half %max1
@@ -539,6 +1331,20 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   %cast = bitcast half %max1 to i16
@@ -557,6 +1363,18 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, |v0|, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -573,6 +1391,18 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, v0, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -589,6 +1419,18 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
@@ -605,6 +1447,18 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, |v0|, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -623,6 +1477,18 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, -v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %b.fneg = fneg half %b
   %c.fneg = fneg half %c
@@ -641,6 +1507,18 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e64 v1, v0, -|v2|
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -662,6 +1540,18 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, -v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -678,6 +1568,18 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e64 v3, v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
   %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -694,6 +1596,18 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_max_f16_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
@@ -710,6 +1624,18 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, 0x4800, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_const0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v2, 0x4800, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   ret half %max1
@@ -725,6 +1651,18 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 0x4800
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16__const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
   ret half %max1
@@ -740,6 +1678,18 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v2, 4.0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half %c)
   ret half %max1
@@ -755,6 +1705,18 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, 4.0, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half %b)
   %max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
   ret half %max1
@@ -772,6 +1734,18 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_maximum3_f16 v0, v0, s0, 0x4c00
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_max_f16_e32 v1, 0x4c00, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
   %max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
   ret half %max1
@@ -789,6 +1763,27 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v2, v0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v2, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
   ret <2 x half> %max1
@@ -806,6 +1801,27 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
   ret <2 x half> %max1
@@ -826,6 +1842,30 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX9-NEXT:    v_pk_max_f16 v3, v3, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
   %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
   %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
@@ -846,6 +1886,27 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x half> %a
   %b.fneg = fneg <2 x half> %b
   %c.fneg = fneg <2 x half> %c
@@ -866,6 +1927,27 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT:    v_pk_max_f16 v2, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
   ret <2 x half> %max1
@@ -881,6 +1963,20 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b, <
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v2, 4.0 op_sel_hi:[1,0]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> <half 4.0, half 4.0>)
   ret <2 x half> %max1
@@ -900,6 +1996,37 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v4, v0
 ; GFX12-NEXT:    v_pk_maximum_f16 v1, v5, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v5, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_pk_max_f16 v2, v4, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
   ret <3 x half> %max1
@@ -919,6 +2046,37 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4
 ; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v5
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_pk_max_f16 v2, v2, v4
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
   ret <3 x half> %max1
@@ -945,6 +2103,43 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4
 ; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v7, 0x7fff7fff, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x7fff7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x7fff7fff, v2
+; GFX9-NEXT:    v_pk_max_f16 v7, v7, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT:    v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT:    v_pk_max_f16 v2, v2, v11
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT:    v_pk_max_f16 v6, v6, v10
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
   %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
   %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
@@ -967,6 +2162,37 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x half> %a
   %b.fneg = fneg <3 x half> %b
   %c.fneg = fneg <3 x half> %c
@@ -989,6 +2215,35 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
 ; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_pk_max_f16 v7, v1, 2.0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT:    v_pk_max_f16 v4, v4, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX9-NEXT:    v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT:    v_pk_max_f16 v7, v7, v3
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
   ret <3 x half> %max1
@@ -1005,6 +2260,23 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b, <
 ; GFX12-NEXT:    v_pk_maximum_f16 v0, v4, 4.0 op_sel_hi:[1,0]
 ; GFX12-NEXT:    v_pk_maximum_f16 v1, v5, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v4, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-NEXT:    v_pk_max_f16 v1, v5, 4.0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> <half 4.0, half 4.0, half 4.0>)
   ret <3 x half> %max1
@@ -1022,6 +2294,20 @@ define double @v_fmaximum3_f64(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
   ret double %max1
@@ -1039,6 +2325,20 @@ define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[4:5], v[0:1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %c, double %max0)
   ret double %max1
@@ -1054,6 +2354,23 @@ define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, do
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
   %cast = bitcast double %max1 to <2 x i32>
@@ -1078,6 +2395,20 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], |v[0:1]|, v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call double @llvm.fabs.f64(double %a)
   %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
@@ -1096,6 +2427,20 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], |v[2:3]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call double @llvm.fabs.f64(double %b)
   %max0 = call double @llvm.maximum.f64(double %a, double %b.fabs)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
@@ -1114,6 +2459,20 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call double @llvm.fabs.f64(double %c)
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
@@ -1132,6 +2491,20 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call double @llvm.fabs.f64(double %a)
   %b.fabs = call double @llvm.fabs.f64(double %b)
   %c.fabs = call double @llvm.fabs.f64(double %c)
@@ -1152,6 +2525,20 @@ define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], -v[0:1], -v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg double %a
   %b.fneg = fneg double %b
   %c.fneg = fneg double %c
@@ -1172,6 +2559,20 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -|v[4:5]|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], -|v[4:5]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call double @llvm.fabs.f64(double %a)
   %b.fabs = call double @llvm.fabs.f64(double %b)
   %c.fabs = call double @llvm.fabs.f64(double %c)
@@ -1195,6 +2596,20 @@ define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], -v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg double %a
   %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
@@ -1213,6 +2628,20 @@ define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg double %b
   %max0 = call double @llvm.maximum.f64(double %a, double %b.fneg)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
@@ -1231,6 +2660,20 @@ define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg double %c
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
@@ -1249,6 +2692,22 @@ define double @v_fmaximum3_f64_const0(double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_const0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double 8.0, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
   ret double %max1
@@ -1266,6 +2725,22 @@ define double @v_fmaximum3_f64__const2(double %a, double %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], 0x40200000, v[0:1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64__const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double 8.0)
   ret double %max1
@@ -1283,6 +2758,20 @@ define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_inlineimm0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], 4.0
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double 4.0, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double %c)
   ret double %max1
@@ -1300,6 +2789,20 @@ define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], v[0:1], 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64__inlineimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], 4.0
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double %b)
   %max1 = call double @llvm.maximum.f64(double %max0, double 4.0)
   ret double %max1
@@ -1317,6 +2820,24 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_maximum_f64 v[0:1], 0x40300000, v[0:1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_const1_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40300000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.maximum.f64(double %a, double 8.0)
   %max1 = call double @llvm.maximum.f64(double %max0, double 16.0)
   ret double %max1
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index baa6b0d7b8a19..ec2a71dcaecb3 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
 
 declare half @llvm.fabs.f16(half)
 declare float @llvm.fabs.f32(float)
@@ -23,6 +24,18 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   ret float %max1
@@ -38,6 +51,18 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %c, float %max0)
   ret float %max1
@@ -51,6 +76,19 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre
 ; GFX12-NEXT:    v_minimum3_f32 v0, s0, s1, v0
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   %cast = bitcast float %max1 to i32
@@ -68,6 +106,18 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -84,6 +134,18 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, v0, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %max0 = call float @llvm.minimum.f32(float %a, float %b.fabs)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -100,6 +162,18 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call float @llvm.fabs.f32(float %c)
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
@@ -116,6 +190,18 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -134,6 +220,18 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, -v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %b.fneg = fneg float %b
   %c.fneg = fneg float %c
@@ -152,6 +250,18 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -173,6 +283,18 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, -v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -189,6 +311,18 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e64 v3, v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg float %b
   %max0 = call float @llvm.minimum.f32(float %a, float %b.fneg)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -205,6 +339,18 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f32_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg float %c
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
@@ -221,6 +367,18 @@ define float @v_fminimum3_f32_const0(float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, 0x41000000, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_const0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v2, 0x41000000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float 8.0, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   ret float %max1
@@ -236,6 +394,18 @@ define float @v_fminimum3_f32__const2(float %a, float %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, 0x41000000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32__const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float 8.0)
   ret float %max1
@@ -251,6 +421,18 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float 4.0, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   ret float %max1
@@ -266,6 +448,18 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32__inlineimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, 4.0, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float 4.0)
   ret float %max1
@@ -283,6 +477,18 @@ define float @v_fminimum3_f32_const1_const2(float %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, s0, 0x41800000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_const1_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_min_f32_e32 v1, 0x41800000, v0
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float 8.0)
   %max1 = call float @llvm.minimum.f32(float %max0, float 16.0)
   ret float %max1
@@ -299,6 +505,52 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
 ; GFX12-NEXT:    v_minimum3_f32 v0, v4, v0, v2
 ; GFX12-NEXT:    v_minimum3_f32 v1, v5, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0)
   ret <2 x float> %max1
@@ -315,6 +567,52 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v2, v4
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -331,6 +629,52 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, |v2|, |v4|
 ; GFX12-NEXT:    v_minimum3_f32 v1, |v1|, |v3|, |v5|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, |v6|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v1|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, |v1|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v3|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v3|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, |v3|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v0|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, |v0|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v2|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v2|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, |v4|, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v4|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v4|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, |v5|, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v5|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v5|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
   %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
   %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
@@ -350,6 +694,52 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v0, -v0, -v2, -v4
 ; GFX12-NEXT:    v_minimum3_f32 v1, -v1, -v3, -v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v1, -v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v1, -v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, -v6, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, -v1, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v3, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, -v3, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v2, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, -v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, -v4, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v4, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, -v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, -v5, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v5, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x float> %a
   %b.fneg = fneg <2 x float> %b
   %c.fneg = fneg <2 x float> %c
@@ -369,6 +759,40 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, 2.0, v2
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, 2.0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 2.0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 2.0, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -385,6 +809,20 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
 ; GFX12-NEXT:    v_minimum_f32 v0, v4, 4.0
 ; GFX12-NEXT:    v_minimum_f32 v1, v5, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 4.0, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> <float 4.0, float 4.0>)
   ret <2 x float> %max1
@@ -402,6 +840,72 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
 ; GFX12-NEXT:    v_minimum3_f32 v1, v7, v1, v4
 ; GFX12-NEXT:    v_minimum3_f32 v2, v8, v2, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v6, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v7, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v8, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0)
   ret <3 x float> %max1
@@ -419,6 +923,72 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, v4, v7
 ; GFX12-NEXT:    v_minimum3_f32 v2, v2, v5, v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -436,6 +1006,72 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v1, |v1|, |v4|, |v7|
 ; GFX12-NEXT:    v_minimum3_f32 v2, |v2|, |v5|, |v8|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v2|, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v10, |v9|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v2|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, |v2|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v5|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, |v5|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v1|, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, |v5|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v1|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, |v1|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v4|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v4|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], |v0|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, |v4|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v0|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, |v0|, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v3|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v3|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, |v6|
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, |v6|, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v6|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, |v6|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, |v7|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, |v7|, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v7|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, |v7|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v2, |v8|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, |v8|, v2, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], |v8|, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, |v8|, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
   %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
   %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
@@ -456,6 +1092,72 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v1, -v1, -v4, -v7
 ; GFX12-NEXT:    v_minimum3_f32 v2, -v2, -v5, -v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v2, -v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v2, -v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v10, -v9, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, -v2, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -v5, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v1, -v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v1, -v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, -v5, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v4, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e64 s[4:5], -v0, -v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, -v4, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v3, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v0, -v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, -v6, v0, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v6, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -v6, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, -v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, -v7, v1, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v7, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -v7, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v2, -v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, -v8, v2, s[4:5]
+; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 s[4:5], -v8, 32
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -v8, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x float> %a
   %b.fneg = fneg <3 x float> %b
   %c.fneg = fneg <3 x float> %c
@@ -476,6 +1178,54 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, 2.0, v4
 ; GFX12-NEXT:    v_minimum3_f32 v2, v2, 2.0, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 2.0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 2.0, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 2.0, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 2.0, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -493,6 +1243,24 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 ; GFX12-NEXT:    v_minimum_f32 v1, v7, 4.0
 ; GFX12-NEXT:    v_minimum_f32 v2, v8, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v6, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 4.0, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 4.0, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> <float 4.0, float 4.0, float 4.0>)
   ret <3 x float> %max1
@@ -509,6 +1277,18 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   ret half %max1
@@ -524,6 +1304,18 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v2, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %c, half %max0)
   ret half %max1
@@ -539,6 +1331,20 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, s2, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   %cast = bitcast half %max1 to i16
@@ -557,6 +1363,18 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, |v0|, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -573,6 +1391,18 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, v0, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -589,6 +1419,18 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call half @llvm.fabs.f16(half %c)
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
@@ -605,6 +1447,18 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, |v0|, |v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e64 v1, v0, |v2|
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -623,6 +1477,18 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, -v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %b.fneg = fneg half %b
   %c.fneg = fneg half %c
@@ -641,6 +1507,18 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e64 v1, v0, -|v2|
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %b.fabs = call half @llvm.fabs.f16(half %b)
   %c.fabs = call half @llvm.fabs.f16(half %c)
@@ -662,6 +1540,18 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, -v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg half %a
   %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -678,6 +1568,18 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e64 v3, v0, -v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg half %b
   %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -694,6 +1596,18 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_min_f16_e64 v1, v0, -v2
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg half %c
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
@@ -710,6 +1624,18 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, 0x4800, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_const0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v2, 0x4800, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half 8.0, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   ret half %max1
@@ -725,6 +1651,18 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 0x4800
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16__const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
   ret half %max1
@@ -740,6 +1678,18 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v2, 4.0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half 4.0, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half %c)
   ret half %max1
@@ -755,6 +1705,18 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16__inlineimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, 4.0, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half %b)
   %max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
   ret half %max1
@@ -772,6 +1734,18 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_minimum3_f16 v0, v0, s0, 0x4c00
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_const1_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_min_f16_e32 v1, 0x4c00, v0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call half @llvm.minimum.f16(half %a, half 8.0)
   %max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
   ret half %max1
@@ -789,6 +1763,27 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v2, v0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v2, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0)
   ret <2 x half> %max1
@@ -806,6 +1801,27 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
   ret <2 x half> %max1
@@ -826,6 +1842,30 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX9-NEXT:    v_pk_min_f16 v3, v3, v4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX9-NEXT:    v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
   %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
   %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
@@ -846,6 +1886,27 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x half> %a
   %b.fneg = fneg <2 x half> %b
   %c.fneg = fneg <2 x half> %c
@@ -866,6 +1927,27 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT:    v_pk_min_f16 v2, v2, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
   ret <2 x half> %max1
@@ -881,6 +1963,20 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b, <
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v2, 4.0 op_sel_hi:[1,0]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
   %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> <half 4.0, half 4.0>)
   ret <2 x half> %max1
@@ -900,6 +1996,37 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v4, v0
 ; GFX12-NEXT:    v_pk_minimum_f16 v1, v5, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v5, v1
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_pk_min_f16 v2, v4, v2
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0)
   ret <3 x half> %max1
@@ -919,6 +2046,37 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4
 ; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v5
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_pk_min_f16 v2, v2, v4
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
   ret <3 x half> %max1
@@ -945,6 +2103,43 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4
 ; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v7, 0x7fff7fff, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x7fff7fff, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x7fff7fff, v2
+; GFX9-NEXT:    v_pk_min_f16 v7, v7, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT:    v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT:    v_pk_min_f16 v2, v2, v11
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x7fff7fff, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT:    v_pk_min_f16 v6, v6, v10
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
   %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
   %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
@@ -967,6 +2162,37 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT:    v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT:    v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT:    v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x half> %a
   %b.fneg = fneg <3 x half> %b
   %c.fneg = fneg <3 x half> %c
@@ -989,6 +2215,35 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v0, v2
 ; GFX12-NEXT:    v_pk_minimum_f16 v1, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__inlineimm1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_pk_min_f16 v7, v1, 2.0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT:    v_pk_min_f16 v4, v4, v2
+; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX9-NEXT:    v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT:    v_pk_min_f16 v7, v7, v3
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
   ret <3 x half> %max1
@@ -1005,6 +2260,23 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b, <
 ; GFX12-NEXT:    v_pk_minimum_f16 v0, v4, 4.0 op_sel_hi:[1,0]
 ; GFX12-NEXT:    v_pk_minimum_f16 v1, v5, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__inlineimm2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v4, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v2, v1, vcc
+; GFX9-NEXT:    v_pk_min_f16 v1, v5, 4.0
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> <half 4.0, half 4.0, half 4.0>)
   ret <3 x half> %max1
@@ -1022,6 +2294,20 @@ define double @v_fminimum3_f64(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
   ret double %max1
@@ -1039,6 +2325,20 @@ define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[4:5], v[0:1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %c, double %max0)
   ret double %max1
@@ -1054,6 +2354,23 @@ define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, do
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    ; return to shader part epilog
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
   %cast = bitcast double %max1 to <2 x i32>
@@ -1078,6 +2395,20 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], |v[0:1]|, v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call double @llvm.fabs.f64(double %a)
   %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
@@ -1096,6 +2427,20 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], |v[2:3]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call double @llvm.fabs.f64(double %b)
   %max0 = call double @llvm.minimum.f64(double %a, double %b.fabs)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
@@ -1114,6 +2459,20 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call double @llvm.fabs.f64(double %c)
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
@@ -1132,6 +2491,20 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call double @llvm.fabs.f64(double %a)
   %b.fabs = call double @llvm.fabs.f64(double %b)
   %c.fabs = call double @llvm.fabs.f64(double %c)
@@ -1152,6 +2525,20 @@ define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], -v[0:1], -v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg double %a
   %b.fneg = fneg double %b
   %c.fneg = fneg double %c
@@ -1172,6 +2559,20 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -|v[4:5]|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], -|v[4:5]|
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call double @llvm.fabs.f64(double %a)
   %b.fabs = call double @llvm.fabs.f64(double %b)
   %c.fabs = call double @llvm.fabs.f64(double %c)
@@ -1195,6 +2596,20 @@ define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], -v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg double %a
   %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
@@ -1213,6 +2628,20 @@ define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg double %b
   %max0 = call double @llvm.minimum.f64(double %a, double %b.fneg)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
@@ -1231,6 +2660,20 @@ define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg double %c
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
@@ -1249,6 +2692,22 @@ define double @v_fminimum3_f64_const0(double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_const0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double 8.0, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
   ret double %max1
@@ -1266,6 +2725,22 @@ define double @v_fminimum3_f64__const2(double %a, double %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], 0x40200000, v[0:1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64__const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double 8.0)
   ret double %max1
@@ -1283,6 +2758,20 @@ define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], v[2:3]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_inlineimm0:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], 4.0
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double 4.0, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double %c)
   ret double %max1
@@ -1300,6 +2789,20 @@ define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], v[0:1], 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64__inlineimm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], 4.0
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double %b)
   %max1 = call double @llvm.minimum.f64(double %max0, double 4.0)
   ret double %max1
@@ -1317,6 +2820,24 @@ define double @v_fminimum3_f64_const1_const2(double %a) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_minimum_f64 v[0:1], 0x40300000, v[0:1]
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_const1_const2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0x40300000
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call double @llvm.minimum.f64(double %a, double 8.0)
   %max1 = call double @llvm.minimum.f64(double %max0, double 16.0)
   ret double %max1



More information about the llvm-commits mailing list