[llvm] AMDGPU: Clean up maximum3/minimum3 tests (PR #93025)

Wed May 22 05:19:00 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

These were using patterns copied from older tests, before non-kernel functions were supported and manually written checks. Also stop using -flat-for-global, which only exists to try to share tests between SI/CI and VI+.

This was also missing test coverage, we're incorrectly forming maximum3/minimum3 pre-gfx12. This is a pre-commit before fixing that.

---

Patch is 115.76 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93025.diff


2 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+1318-93) 
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+1318-93) 


``````````diff

diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084dc4b80..fe6de63742b76 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,1323 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.maximum.f32(float %a, float %b)
-  %f1 = call float @llvm.maximum.f32(float %f0, float %c)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile float, ptr addrspace(1) %aptr, align 4
-  %b = load volatile float, ptr addrspace(1) %bptr, align 4
-  %c = load volatile float, ptr addrspace(1) %cptr, align 4
-  %f0 = call float @llvm.maximum.f32(float %a, float %b)
-  %f1 = call float @llvm.maximum.f32(float %c, float %f0)
-  store float %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.maximum.f16(half %a, half %b)
-  %f1 = call half @llvm.maximum.f16(half %f0, half %c)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile half, ptr addrspace(1) %aptr, align 2
-  %b = load volatile half, ptr addrspace(1) %bptr, align 2
-  %c = load volatile half, ptr addrspace(1) %cptr, align 2
-  %f0 = call half @llvm.maximum.f16(half %a, half %b)
-  %f1 = call half @llvm.maximum.f16(half %c, half %f0)
-  store half %f1, ptr addrspace(1) %out, align 2
-  ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
-  %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
-  %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
-  %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
-  ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
-  %a = load volatile double, ptr addrspace(1) %aptr, align 4
-  %b = load volatile double, ptr addrspace(1) %bptr, align 4
-  %c = load volatile double, ptr addrspace(1) %cptr, align 4
-  %f0 = call double @llvm.maximum.f64(double %a, double %b)
-  %f1 = call double @llvm.maximum.f64(double %f0, double %c)
-  store double %f1, ptr addrspace(1) %out, align 4
-  ret void
-}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
+
+declare half @llvm.fabs.f16(half)
+declare float @llvm.fabs.f32(float)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
 declare double @llvm.maximum.f64(double, double)
 declare float @llvm.maximum.f32(float, float)
 declare half @llvm.maximum.f16(half, half)
 declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+  ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    ; return to shader part epilog
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  %cast = bitcast float %max1 to i32
+  %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %b.fneg = fneg float %b
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %a.fneg.fabs = fneg float %a.fabs
+  %b.fneg.fabs = fneg float %b.fabs
+  %c.fneg.fabs = fneg float %c.fabs
+  %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg float %a
+  %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %b.fneg = fneg float %b
+  %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %c.fneg = fneg float %c
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_const0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float %b)
+  %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
+  ret float %max1
+}
+
+define float @v_fmaximum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
+  %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
+  ret float %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v4, v0, v2
+; GFX12-NEXT:    v_maximum3_f32 v1, v5, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32_commute:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, v2, v4
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, v3, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT:    v_maximum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+  %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT:    v_maximum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %a.fneg = fneg <2 x float> %a
+  %b.fneg = fneg <2 x float> %b
+  %c.fneg = fneg <2 x float> %c
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT:    v_maximum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+  %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+  ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_maximum_f32 v0, v4, 4.0
+; GFX12-NEXT:    v_maximum_f32 v1, v5, 4.0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+  %max1 = call <...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/93025