[llvm] AMDGPU: Form v2f16 minimum3/maximum3 on gfx950 (PR #128123)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 21:09:08 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/128123
From 3263aaf2d7c4223a957d68ec2adc78d107044bdd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 28 May 2024 12:59:41 +0200
Subject: [PATCH] AMDGPU: Form v2f16 minimum3/maximum3 on gfx950
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +-
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 175 ++++++--------------
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 175 ++++++--------------
4 files changed, 112 insertions(+), 245 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0b13a53a0c989..6ed09253c51e1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13515,7 +13515,8 @@ static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
- (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16());
+ (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
+ (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index d5c6e8af109f4..85c047167f1e1 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -145,8 +145,8 @@ def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
let SubtargetPredicate = HasMinimum3Maximum3PKF16, FPDPRounding = 1 in {
-defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
-defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+defm V_PK_MINIMUM3_F16 : VOP3PInst<"v_pk_minimum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfminimum3>;
+defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, AMDGPUfmaximum3>;
}
// TODO: Make sure we're doing the right thing with denormals. Note
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index f228824ff750e..2a372dffce650 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1269,9 +1269,7 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
; GFX950-LABEL: v_fmaximum3_f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1306,9 +1304,7 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
; GFX950-LABEL: v_fmaximum3_f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %c, half %max0)
@@ -1346,10 +1342,9 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
;
; GFX950-LABEL: s_fmaximum3_f16:
; GFX950: ; %bb.0:
-; GFX950-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2
+; GFX950-NEXT: v_mov_b32_e32 v0, s1
+; GFX950-NEXT: v_mov_b32_e32 v1, s2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, s0, v0, v1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX950-NEXT: s_nop 0
@@ -1392,9 +1387,7 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
@@ -1431,9 +1424,7 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call half @llvm.fabs.f16(half %b)
%max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
@@ -1470,9 +1461,7 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call half @llvm.fabs.f16(half %c)
%max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1511,9 +1500,7 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1554,9 +1541,7 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%b.fneg = fneg half %b
@@ -1597,9 +1582,7 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1
; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1641,9 +1624,7 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
@@ -1680,9 +1661,7 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fneg = fneg half %b
%max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
@@ -1719,9 +1698,7 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fneg = fneg half %c
%max0 = call half @llvm.maximum.f16(half %a, half %b)
@@ -1758,9 +1735,7 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_movk_i32 s0, 0x4800
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half 8.0, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1795,9 +1770,8 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
; GFX950-LABEL: v_fmaximum3_f16__const2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_movk_i32 s0, 0x4800
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, s0
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
@@ -1832,9 +1806,7 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
; GFX950-LABEL: v_fmaximum3_f16_inlineimm0:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half 4.0, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1869,9 +1841,7 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
; GFX950-LABEL: v_fmaximum3_f16__inlineimm:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, 4.0
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
@@ -1909,9 +1879,8 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_movk_i32 s0, 0x4800
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
-; GFX950-NEXT: s_movk_i32 s0, 0x4c00
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x4c00
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half 8.0)
%max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
@@ -1959,9 +1928,7 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX950-LABEL: v_fmaximum3_v2f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
@@ -2009,9 +1976,7 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX950-LABEL: v_fmaximum3_v2f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -2067,9 +2032,7 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
%b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
@@ -2120,9 +2083,7 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX950-LABEL: v_fmaximum3_v2f16__fneg_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <2 x half> %a
%b.fneg = fneg <2 x half> %b
@@ -2173,9 +2134,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -2223,9 +2182,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, 4.0 op_sel_hi:[1,1,0]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
@@ -2287,10 +2244,8 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX950-LABEL: v_fmaximum3_v3f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
%max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
@@ -2352,10 +2307,8 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX950-LABEL: v_fmaximum3_v3f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
%max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -2428,16 +2381,14 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
-; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
%b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
@@ -2502,10 +2453,8 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX950-LABEL: v_fmaximum3_v3f16__fneg_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <3 x half> %a
%b.fneg = fneg <3 x half> %b
@@ -2567,10 +2516,8 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, 2.0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
%max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -2632,10 +2579,8 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 4.0, 4.0
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0]
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, 4.0
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
%max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
@@ -2703,10 +2648,8 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX950-LABEL: v_fmaximum3_v4f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
%max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0)
@@ -2774,10 +2717,8 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX950-LABEL: v_fmaximum3_v4f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
%max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -2856,16 +2797,14 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX950-LABEL: v_fmaximum3_v4f16__fabs_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
-; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
%b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
@@ -2936,10 +2875,8 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <4 x half> %a
%b.fneg = fneg <4 x half> %b
@@ -3008,10 +2945,8 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, v3 op_sel_hi:[1,0,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
%max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -3079,10 +3014,8 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 4.0, 4.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0]
+; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, 4.0 op_sel_hi:[1,1,0]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
%max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 8ba73071d9adb..34d7e5acb7896 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1269,9 +1269,7 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
; GFX950-LABEL: v_fminimum3_f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1306,9 +1304,7 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
; GFX950-LABEL: v_fminimum3_f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %c, half %max0)
@@ -1346,10 +1342,9 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %
;
; GFX950-LABEL: s_fminimum3_f16:
; GFX950: ; %bb.0:
-; GFX950-NEXT: v_mov_b32_e32 v0, s0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s2, s2
+; GFX950-NEXT: v_mov_b32_e32 v0, s1
+; GFX950-NEXT: v_mov_b32_e32 v1, s2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, s0, v0, v1
; GFX950-NEXT: s_nop 0
; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX950-NEXT: s_nop 0
@@ -1392,9 +1387,7 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
@@ -1431,9 +1424,7 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call half @llvm.fabs.f16(half %b)
%max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
@@ -1470,9 +1461,7 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call half @llvm.fabs.f16(half %c)
%max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -1511,9 +1500,7 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1554,9 +1541,7 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%b.fneg = fneg half %b
@@ -1597,9 +1582,7 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1
; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
@@ -1641,9 +1624,7 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
@@ -1680,9 +1661,7 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fneg = fneg half %b
%max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
@@ -1719,9 +1698,7 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fneg = fneg half %c
%max0 = call half @llvm.minimum.f16(half %a, half %b)
@@ -1758,9 +1735,7 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_movk_i32 s0, 0x4800
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half 8.0, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1795,9 +1770,8 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
; GFX950-LABEL: v_fminimum3_f16__const2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
; GFX950-NEXT: s_movk_i32 s0, 0x4800
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, s0
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
@@ -1832,9 +1806,7 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
; GFX950-LABEL: v_fminimum3_f16_inlineimm0:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half 4.0, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half %c)
@@ -1869,9 +1841,7 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
; GFX950-LABEL: v_fminimum3_f16__inlineimm:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, 4.0
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half %b)
%max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
@@ -1909,9 +1879,8 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_movk_i32 s0, 0x4800
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
-; GFX950-NEXT: s_movk_i32 s0, 0x4c00
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, s0
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x4c00
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.minimum.f16(half %a, half 8.0)
%max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
@@ -1959,9 +1928,7 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX950-LABEL: v_fminimum3_v2f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v1
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0)
@@ -2009,9 +1976,7 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX950-LABEL: v_fminimum3_v2f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -2067,9 +2032,7 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
%b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
@@ -2120,9 +2083,7 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX950-LABEL: v_fminimum3_v2f16__fneg_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <2 x half> %a
%b.fneg = fneg <2 x half> %b
@@ -2173,9 +2134,7 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
%max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
@@ -2223,9 +2182,7 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX950-LABEL: v_fminimum3_v2f16__inlineimm2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, 4.0 op_sel_hi:[1,1,0]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
@@ -2287,10 +2244,8 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX950-LABEL: v_fminimum3_v3f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
%max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0)
@@ -2352,10 +2307,8 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX950-LABEL: v_fminimum3_v3f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
%max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -2428,16 +2381,14 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX950-LABEL: v_fminimum3_v3f16__fabs_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
-; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
%b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
@@ -2502,10 +2453,8 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX950-LABEL: v_fminimum3_v3f16__fneg_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <3 x half> %a
%b.fneg = fneg <3 x half> %b
@@ -2567,10 +2516,8 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX950-LABEL: v_fminimum3_v3f16__inlineimm1:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, 2.0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
%max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
@@ -2632,10 +2579,8 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX950-LABEL: v_fminimum3_v3f16__inlineimm2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 4.0, 4.0
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0]
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, 4.0
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
%max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
@@ -2703,10 +2648,8 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX950-LABEL: v_fminimum3_v4f16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
%max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0)
@@ -2774,10 +2717,8 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX950-LABEL: v_fminimum3_v4f16_commute:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
%max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -2856,16 +2797,14 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX950-LABEL: v_fminimum3_v4f16__fabs_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
-; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
%b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
@@ -2936,10 +2875,8 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX950-LABEL: v_fminimum3_v4f16__fneg_all:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1]
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v4 neg_lo:[1,1,1] neg_hi:[1,1,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v5 neg_lo:[1,1,1] neg_hi:[1,1,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg <4 x half> %a
%b.fneg = fneg <4 x half> %b
@@ -3008,10 +2945,8 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, v3 op_sel_hi:[1,0,1]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
%max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
@@ -3079,10 +3014,8 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX950-LABEL: v_fminimum3_v4f16__inlineimm2:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 4.0, 4.0 op_sel_hi:[1,0,0]
-; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0]
+; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, 4.0 op_sel_hi:[1,1,0]
+; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, 4.0 op_sel_hi:[1,1,0]
; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
%max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
More information about the llvm-commits mailing list