[llvm] AMDGPU: Widen f16 minimum/maximum to v2f16 on gfx950 (PR #128121)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 21:03:33 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/128121
>From 8b17104890d7c32d5be530537fa844e5dc868ec9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 23 May 2024 21:01:16 +0200
Subject: [PATCH] AMDGPU: Widen f16 minimum/maximum to v2f16 on gfx950
Unfortunately we only have the vector versions of v2f16 minimum3
and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y).
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40 +-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 689 ++++++++++++-------
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 689 ++++++++++++-------
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 66 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 66 +-
6 files changed, 966 insertions(+), 585 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 909ad07782fc6..0b13a53a0c989 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -869,8 +869,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMinimum3Maximum3F32())
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
- if (Subtarget->hasMinimum3Maximum3PKF16())
+ if (Subtarget->hasMinimum3Maximum3PKF16()) {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
+
+ // If only the vector form is available, we need to widen to a vector.
+ if (!Subtarget->hasMinimum3Maximum3F16())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+ }
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
@@ -5964,6 +5969,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ return lowerFMINIMUM_FMAXIMUM(Op, DAG);
case ISD::FLDEXP:
case ISD::STRICT_FLDEXP:
return lowerFLDEXP(Op, DAG);
@@ -5985,8 +5993,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINIMUM:
- case ISD::FMAXIMUM:
case ISD::FMINIMUMNUM:
case ISD::FMAXIMUMNUM:
case ISD::UADDSAT:
@@ -6841,6 +6847,34 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return splitBinaryVectorOp(Op, DAG);
+
+ assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() &&
+ Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
+ "should not need to widen f16 minimum/maximum to v2f16");
+
+ // Widen f16 operation to v2f16
+
+ // fminimum f16:x, f16:y ->
+ // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
+ // (v2f16 (scalar_to_vector y))), 0
+ SDLoc SL(Op);
+ SDValue WideSrc0 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
+ SDValue WideSrc1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
+
+ SDValue Widened =
+ DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
+ DAG.getConstant(0, SL, MVT::i32));
+}
+
SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1cd7f1b29e077..9b2c14862407a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 66de7d535db4b..f228824ff750e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1252,19 +1252,27 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
ret half %max1
@@ -1281,19 +1289,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_commute:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_commute:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_commute:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %c, half %max0)
ret half %max1
@@ -1311,22 +1327,34 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_fmaximum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX942-LABEL: s_fmaximum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, s2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_fmaximum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
%cast = bitcast half %max1 to i16
@@ -1346,19 +1374,28 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, |v0|, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1376,19 +1413,28 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, v0, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call half @llvm.fabs.f16(half %b)
%max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1406,19 +1452,28 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call half @llvm.fabs.f16(half %c)
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
@@ -1436,19 +1491,30 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, |v0|, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1468,19 +1534,30 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -v0, -v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%b.fneg = fneg half %b
%c.fneg = fneg half %c
@@ -1500,19 +1577,30 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -|v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1535,19 +1623,28 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1565,19 +1662,28 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, v0, -v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fneg = fneg half %b
%max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1595,19 +1701,28 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fneg = fneg half %c
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
@@ -1625,19 +1740,28 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_const0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_const0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v2, 0x4800, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_const0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_movk_i32 s0, 0x4800
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half 8.0, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
ret half %max1
@@ -1654,19 +1778,27 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16__const2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16__const2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16__const2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_movk_i32 s0, 0x4800
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
ret half %max1
@@ -1683,19 +1815,27 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v2, 4.0, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half 4.0, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
ret half %max1
@@ -1712,19 +1852,27 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, 4.0, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
ret half %max1
@@ -1743,19 +1891,28 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, 0x4c00, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_movk_i32 s0, 0x4800
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x4c00
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half 8.0)
%max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
ret half %max1
@@ -3623,20 +3780,30 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, v2, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
%insert.0 = insertelement <2 x half> poison, half %max0, i32 0
@@ -3654,23 +3821,35 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f16__multi_use(half inreg %a, half in
; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_no_fmaximum3_f16__multi_use:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: ; return to shader part epilog
+; GFX942-LABEL: s_no_fmaximum3_f16__multi_use:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, s2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: v_readfirstlane_b32 s1, v1
+; GFX942-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_no_fmaximum3_f16__multi_use:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, s2, s2
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
%cast0 = bitcast half %max0 to i16
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 56e0b2c2f06ce..8ba73071d9adb 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1252,19 +1252,27 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
ret half %max1
@@ -1281,19 +1289,27 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_commute:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_commute:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_commute:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %c, half %max0)
ret half %max1
@@ -1311,22 +1327,34 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_fminimum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX942-LABEL: s_fminimum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, s2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_fminimum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s2, s2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
%cast = bitcast half %max1 to i16
@@ -1346,19 +1374,28 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fabs0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, |v0|, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1376,19 +1413,28 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fabs1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, v0, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call half @llvm.fabs.f16(half %b)
%max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1406,19 +1452,28 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fabs2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call half @llvm.fabs.f16(half %c)
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
@@ -1436,19 +1491,30 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, |v0|, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1468,19 +1534,30 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fneg_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, -v0, -v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e64 v1, v0, -v2
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%b.fneg = fneg half %b
%c.fneg = fneg half %c
@@ -1500,19 +1577,30 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, -|v0|, -|v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e64 v1, v0, -|v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1535,19 +1623,28 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fneg0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, -v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1565,19 +1662,28 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fneg1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e64 v3, v0, -v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fneg = fneg half %b
%max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1595,19 +1701,28 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_fneg2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_fneg2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e64 v1, v0, -v2
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_fneg2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fneg = fneg half %c
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
@@ -1625,19 +1740,28 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_const0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_const0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v2, 0x4800, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_const0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_movk_i32 s0, 0x4800
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half 8.0, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
ret half %max1
@@ -1654,19 +1778,27 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16__const2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16__const2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16__const2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_movk_i32 s0, 0x4800
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
ret half %max1
@@ -1683,19 +1815,27 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v2, 4.0, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half 4.0, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
ret half %max1
@@ -1712,19 +1852,27 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16__inlineimm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16__inlineimm:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16__inlineimm:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
ret half %max1
@@ -1743,19 +1891,28 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fminimum3_f16_const1_const2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fminimum3_f16_const1_const2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v1, 0x4800, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, 0x4c00, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f16_const1_const2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_movk_i32 s0, 0x4800
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_movk_i32 s0, 0x4c00
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half 8.0)
%max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
ret half %max1
@@ -3623,20 +3780,30 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_no_fminimum3_f16__multi_use:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, v2, v2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
%insert.0 = insertelement <2 x half> poison, half %max0, i32 0
@@ -3654,23 +3821,35 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f16__multi_use(half inreg %a, half in
; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_no_fminimum3_f16__multi_use:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_min_f16_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: ; return to shader part epilog
+; GFX942-LABEL: s_no_fminimum3_f16__multi_use:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_min_f16_e32 v1, s2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: v_readfirstlane_b32 s1, v1
+; GFX942-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_no_fminimum3_f16__multi_use:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, s2, s2
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: v_readfirstlane_b32 s1, v1
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
%cast0 = bitcast half %max0 to i16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 4532571d5cf2a..e828a12442fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -42,11 +42,7 @@ define half @v_maximum_f16(half %src0, half %src1) {
; GFX950-LABEL: v_maximum_f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16:
@@ -96,11 +92,17 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maximum_f16__nnan:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nnan:
; GFX10: ; %bb.0:
@@ -162,11 +164,7 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
; GFX950-LABEL: v_maximum_f16__nsz:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nsz:
@@ -216,11 +214,17 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_maximum_f16__nnan_nsz:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f16__nnan_nsz:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f16__nnan_nsz:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -286,11 +290,7 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX950-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nnan_src0:
@@ -367,11 +367,7 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1
-; GFX950-NEXT: v_max_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maximum_f16__nnan_src1:
@@ -458,12 +454,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX950-LABEL: s_maximum_f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, s1
-; GFX950-NEXT: v_max_f16_e32 v1, s0, v0
-; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
@@ -2505,3 +2498,4 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 0b9cb9682ea5f..9a2ef15737308 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -30,11 +30,7 @@ define half @v_minimum_f16(half %src0, half %src1) {
; GFX950-LABEL: v_minimum_f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16:
@@ -74,11 +70,17 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minimum_f16__nnan:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nnan:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nnan:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nnan:
; GFX10: ; %bb.0:
@@ -127,11 +129,7 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
; GFX950-LABEL: v_minimum_f16__nsz:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nsz:
@@ -171,11 +169,17 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
; GFX8-NEXT: v_min_f16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_minimum_f16__nnan_nsz:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f16__nnan_nsz:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f16__nnan_nsz:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nnan_nsz:
; GFX10: ; %bb.0:
@@ -227,11 +231,7 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX950-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nnan_src0:
@@ -294,11 +294,7 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_add_f16_e32 v1, 1.0, v1
-; GFX950-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_minimum_f16__nnan_src1:
@@ -368,12 +364,9 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX950-LABEL: s_minimum_f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v0, s1
-; GFX950-NEXT: v_min_f16_e32 v1, s0, v0
-; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; use v0
@@ -1924,3 +1917,4 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
+; GFX9: {{.*}}
More information about the llvm-commits mailing list