[llvm-branch-commits] [llvm] AMDGPU: Make v2f16 minimum/maximum legal for gfx950 (PR #117738)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Nov 26 08:52:08 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
---
Patch is 186.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117738.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+11)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+820-641)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+820-641)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+98-250)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+98-250)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a24b6430378cc9..a212a9218ca0db 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -859,6 +859,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
if (Subtarget->hasMinimum3Maximum3F32())
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
+
+ if (Subtarget->hasMinimum3Maximum3PKF16())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ae5a6581a3b200..7d202de6643bcb 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -255,6 +255,17 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
}
+class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
+ Instruction inst> : GCNPat<
+ (v2f16 (node (VOP3PMods v2f16:$src0, i32:$src0_mods), (VOP3PMods v2f16:$src1, i32:$src1_mods))),
+ (inst $src0_mods, $src0, $src1_mods, $src1, $src1_mods, $src1)
+>;
+
+let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
+def : MinimumMaximumByMinimum3Maximum3VOP3P<fminimum, V_PK_MINIMUM3_F16>;
+def : MinimumMaximumByMinimum3Maximum3VOP3P<fmaximum, V_PK_MAXIMUM3_F16>;
+}
+
let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
// These are VOP3a-like opcodes which accept no omod.
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index e771e5801f2eda..f0fa621e3b4bc3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1772,30 +1772,38 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v2f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0
-; GFX9-NEXT: v_pk_max_f16 v1, v2, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f16:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_max_f16 v3, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT: s_mov_b32 s0, 0x5040100
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0
+; GFX940-NEXT: v_pk_max_f16 v1, v2, v1
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
ret <2 x half> %max1
@@ -1814,30 +1822,38 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v2f16_commute:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f16_commute:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_max_f16 v3, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT: s_mov_b32 s0, 0x5040100
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f16_commute:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
ret <2 x half> %max1
@@ -1859,32 +1875,43 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1
-; GFX9-NEXT: v_pk_max_f16 v3, v3, v4
-; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
-; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v5
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
-; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX940-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX940-NEXT: v_pk_max_f16 v3, v3, v4
+; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT: s_mov_b32 s0, 0x5040100
+; GFX940-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX940-NEXT: v_perm_b32 v1, v4, v0, s0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v5
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
%b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
%c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
@@ -1906,30 +1933,38 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX940-NEXT: s_mov_b32 s0, 0x5040100
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <2 x half> %a
%b.fneg = fneg <2 x half> %b
%c.fneg = fneg <2 x half> %c
@@ -1951,30 +1986,38 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0
-; GFX9-NEXT: v_pk_max_f16 v2, v2, v1
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT: s_mov_b32 s0, 0x5040100
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX940-NEXT: v_perm_b32 v2, v3, v0, s0
+; GFX940-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
ret <2 x half> %max1
@@ -1993,30 +2036,38 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0
-; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0]
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX940-NEXT: s_mov_b32 s0, 0x5040100
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT: v_perm_b32 v1, v0, v4, s0
+; GFX940-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
ret <2 x half> %max1
@@ -2037,42 +2088,51 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_v3f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_mov_b32 s0, 0x5040100
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
-; GF...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/117738
More information about the llvm-branch-commits mailing list