[llvm] [AMDGPU][True16][CodeGen] true16 codegen for valu op (PR #124797)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 28 09:29:35 PST 2025
https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/124797
Select true16 (`_t16`) instruction variants for VALU ops when real true16 instructions are enabled, and update the affected codegen tests.
From e6572065040ce391c98a20b4bfebf88183d137e1 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 28 Jan 2025 10:24:04 -0500
Subject: [PATCH] true16 selection for valu op
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 54 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll | 376 ++++--
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 324 ++++--
llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 179 ++-
.../CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll | 4 +
.../CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll | 4 +
.../CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll | 76 ++
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 199 ++--
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 1035 +++++++++++------
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 119 +-
llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 119 +-
11 files changed, 1739 insertions(+), 750 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5727d14ec49e8a..23df0844866374 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -814,7 +814,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (Fix16BitCopies) {
if (((Size == 16) != (SrcSize == 16))) {
// Non-VGPR Src and Dst will later be expanded back to 32 bits.
- assert(ST.hasTrue16BitInsts());
+ assert(ST.useRealTrue16Insts());
MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
RegToFix = SubReg;
@@ -988,7 +988,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- if (ST.hasTrue16BitInsts()) {
+ if (ST.useRealTrue16Insts()) {
if (IsSGPRSrc) {
assert(SrcLow);
SrcReg = NewSrcReg;
@@ -5559,9 +5559,11 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
: AMDGPU::V_FLOOR_F16_fake16_e64;
case AMDGPU::S_TRUNC_F16:
- return AMDGPU::V_TRUNC_F16_fake16_e64;
+ return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64
+ : AMDGPU::V_TRUNC_F16_fake16_e64;
case AMDGPU::S_RNDNE_F16:
- return AMDGPU::V_RNDNE_F16_fake16_e64;
+ return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64
+ : AMDGPU::V_RNDNE_F16_fake16_e64;
case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
@@ -5569,20 +5571,32 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
- case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
- case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
- case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
- case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+ case AMDGPU::S_ADD_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64
+ : AMDGPU::V_ADD_F16_fake16_e64;
+ case AMDGPU::S_SUB_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64
+ : AMDGPU::V_SUB_F16_fake16_e64;
+ case AMDGPU::S_MIN_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64
+ : AMDGPU::V_MIN_F16_fake16_e64;
+ case AMDGPU::S_MAX_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64
+ : AMDGPU::V_MAX_F16_fake16_e64;
case AMDGPU::S_MINIMUM_F16:
return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64
: AMDGPU::V_MINIMUM_F16_fake16_e64;
case AMDGPU::S_MAXIMUM_F16:
return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64
: AMDGPU::V_MAXIMUM_F16_fake16_e64;
- case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
+ case AMDGPU::S_MUL_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
+ : AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+ case AMDGPU::S_FMAC_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+ : AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
@@ -5642,15 +5656,25 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
: AMDGPU::V_CMP_NLT_F16_fake16_e64;
case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
- case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
+ case AMDGPU::V_S_EXP_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64
+ : AMDGPU::V_EXP_F16_fake16_e64;
case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
- case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
+ case AMDGPU::V_S_LOG_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64
+ : AMDGPU::V_LOG_F16_fake16_e64;
case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
- case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
+ case AMDGPU::V_S_RCP_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64
+ : AMDGPU::V_RCP_F16_fake16_e64;
case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
- case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
+ case AMDGPU::V_S_RSQ_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64
+ : AMDGPU::V_RSQ_F16_fake16_e64;
case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
- case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
+ case AMDGPU::V_S_SQRT_F16_e64:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64
+ : AMDGPU::V_SQRT_F16_fake16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index d81faf91801b0b..0c3db94b697420 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define float @v_pow_f32(float %x, float %y) {
; GFX6-LABEL: v_pow_f32:
@@ -371,19 +372,33 @@ define half @v_pow_f16(half %x, half %y) {
; GFX10-NEXT: v_exp_f16_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%pow = call half @llvm.pow.f16(half %x, half %y)
ret half %pow
}
@@ -474,31 +489,60 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
ret <2 x half> %pow
}
@@ -597,33 +641,63 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16_fneg_lhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
ret <2 x half> %pow
@@ -723,32 +797,62 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16_fneg_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_rhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%y.fneg = fneg <2 x half> %y
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
ret <2 x half> %pow
@@ -856,34 +960,66 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_log_f16_e32 v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_log_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_exp_f16_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_exp_f16_e32 v0, v0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%y.fneg = fneg <2 x half> %y
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg)
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 822d40f7349b0f..b7edd292f060f3 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -3,8 +3,10 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
@@ -24,11 +26,32 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fma:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fma:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fma:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fma:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fma:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fma:
; GFX12: ; %bb.0:
@@ -57,11 +80,31 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmac:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmac:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v1.l, v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmac:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmac:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fma_f16 v0.l, v1.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmac:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmac:
; GFX12: ; %bb.0:
@@ -98,11 +141,31 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmaak:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmaak:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, 0x4200
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmaak:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmaak:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, 0x4200
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmaak:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmaak:
; GFX12: ; %bb.0:
@@ -139,11 +202,31 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmamk:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmamk:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 0x4200, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmamk:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmamk:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 0x4200, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmamk:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmamk:
; GFX12: ; %bb.0:
@@ -208,33 +291,61 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: test_D139469_f16:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_D139469_f16:
-; GFX11-GISEL: ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
-; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 0x291e, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 0x291e, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_f16:
; GFX12-SDAG: ; %bb.0: ; %bb
@@ -346,44 +457,83 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: test_D139469_v2f16:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e
-; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_D139469_v2f16:
-; GFX11-GISEL: ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
-; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_v2f16:
; GFX12-SDAG: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 2cc5159c29f7ff..e9877ae5144f5a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fmul_f16(
; SI-LABEL: fmul_f16:
@@ -54,29 +55,55 @@ define amdgpu_kernel void @fmul_f16(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fmul_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s6, s10
-; GFX11-NEXT: s_mov_b32 s7, s11
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fmul_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fmul_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
@@ -127,23 +154,41 @@ define amdgpu_kernel void @fmul_f16_imm_a(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fmul_f16_imm_a:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fmul_f16_imm_a:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x4200, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fmul_f16_imm_a:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0x4200, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
@@ -192,23 +237,41 @@ define amdgpu_kernel void @fmul_f16_imm_b(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fmul_f16_imm_b:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fmul_f16_imm_b:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 4.0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fmul_f16_imm_b:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 4.0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index b78ea1033baacc..c6ea12dd616513 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -1,10 +1,14 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
declare half @llvm.amdgcn.rcp.f16(half %a)
; GCN-LABEL: {{^}}rcp_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GFX11-TRUE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
+; GFX11-FAKE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @rcp_f16(
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 8c003c96dddfe9..0924e9a5c2314b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -1,10 +1,14 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
declare half @llvm.amdgcn.rsq.f16(half %a)
; GCN-LABEL: {{^}}rsq_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GFX11-TRUE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
+; GFX11-FAKE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @rsq_f16(
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll
index bbfb88a4b22a36..aa994558162db0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL-FAKE16 %s
define half @v_sqrt_f16(half %src) {
; GCN-LABEL: v_sqrt_f16:
@@ -8,6 +12,30 @@ define half @v_sqrt_f16(half %src) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_sqrt_f16_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_sqrt_f16:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_sqrt_f16:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_sqrt_f16:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_sqrt_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sqrt = call half @llvm.amdgcn.sqrt.f16(half %src)
ret half %sqrt
}
@@ -18,6 +46,30 @@ define half @v_fabs_sqrt_f16(half %src) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_sqrt_f16_e64 v0, |v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_fabs_sqrt_f16:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, |v0.l|
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_fabs_sqrt_f16:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_sqrt_f16_e64 v0, |v0|
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_fabs_sqrt_f16:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, |v0.l|
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_fabs_sqrt_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_sqrt_f16_e64 v0, |v0|
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call half @llvm.fabs.f16(half %src)
%sqrt = call half @llvm.amdgcn.sqrt.f16(half %fabs.src)
ret half %sqrt
@@ -29,6 +81,30 @@ define half @v_fneg_fabs_sqrt_f16(half %src) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_sqrt_f16_e64 v0, -|v0|
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_fneg_fabs_sqrt_f16:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, -|v0.l|
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_fneg_fabs_sqrt_f16:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_sqrt_f16_e64 v0, -|v0|
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_fneg_fabs_sqrt_f16:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, -|v0.l|
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_fneg_fabs_sqrt_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_sqrt_f16_e64 v0, -|v0|
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call half @llvm.fabs.f16(half %src)
%neg.fabs.src = fneg half %fabs.src
%sqrt = call half @llvm.amdgcn.sqrt.f16(half %neg.fabs.src)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 0517e41e3d651b..3faf84e5d58c82 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
declare half @llvm.maxnum.f16(half %a, half %b)
declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
@@ -113,32 +114,60 @@ define amdgpu_kernel void @maxnum_f16(
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: maxnum_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s6, s10
-; GFX11-NEXT: s_mov_b32 s7, s11
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: maxnum_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: maxnum_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) #0 {
@@ -228,25 +257,45 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: maxnum_f16_imm_a:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: maxnum_f16_imm_a:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 0x4200, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: maxnum_f16_imm_a:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, 0x4200, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) #0 {
entry:
@@ -334,25 +383,45 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: maxnum_f16_imm_b:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: maxnum_f16_imm_b:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 4.0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: maxnum_f16_imm_b:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, 4.0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 0b9cb9682ea5f9..01bfe600f363d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -1,12 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; xUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; xUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
define half @v_minimum_f16(half %src0, half %src1) {
; GFX8-LABEL: v_minimum_f16:
@@ -45,14 +46,24 @@ define half @v_minimum_f16(half %src0, half %src1) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_f16:
; GFX12: ; %bb.0:
@@ -86,11 +97,19 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_f16__nnan:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_f16__nnan:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_f16__nnan:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_f16__nnan:
; GFX12: ; %bb.0:
@@ -142,14 +161,24 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_f16__nsz:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_f16__nsz:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_f16__nsz:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_f16__nsz:
; GFX12: ; %bb.0:
@@ -183,11 +212,19 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
; GFX10-NEXT: v_min_f16_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_f16__nnan_nsz:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_f16__nnan_nsz:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_f16__nnan_nsz:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_f16__nnan_nsz:
; GFX12: ; %bb.0:
@@ -243,15 +280,26 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_f16__nnan_src0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_f16__nnan_src0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_f16__nnan_src0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_f16__nnan_src0:
; GFX12: ; %bb.0:
@@ -310,15 +358,25 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_f16__nnan_src1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_f16__nnan_src1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, 1.0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_f16__nnan_src1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_f16__nnan_src1:
; GFX12: ; %bb.0:
@@ -392,18 +450,31 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: s_minimum_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e64 v0, s0, s1
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v0
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: s_minimum_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
+; GFX11-TRUE16-NEXT: v_min_f16_e64 v0.l, s0, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v0
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: s_minimum_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_min_f16_e64 v0, s0, s1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: s_minimum_f16:
; GFX12: ; %bb.0:
@@ -477,21 +548,41 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v2, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v4, v0, v1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v2f16:
; GFX12: ; %bb.0:
@@ -601,21 +692,41 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v2f16__nsz:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v2, v0, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v2f16__nsz:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v4, v0, v1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v2f16__nsz:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v2f16__nsz:
; GFX12: ; %bb.0:
@@ -749,25 +860,53 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: s_minimum_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v0, s0, s1
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT: s_lshr_b32 s2, s1, 16
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v0
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: s_minimum_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, s0, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use v0
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: s_minimum_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v0, s0, s1
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use v0
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: s_minimum_v2f16:
; GFX12: ; %bb.0:
@@ -846,25 +985,47 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v3f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT: v_pk_min_f16 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v3f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v3f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v3f16:
; GFX12: ; %bb.0:
@@ -991,25 +1152,47 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v3f16__nsz:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT: v_pk_min_f16 v4, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v3f16__nsz:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v3f16__nsz:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v3f16__nsz:
; GFX12: ; %bb.0:
@@ -1151,30 +1334,61 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v4, v1, v3
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_pk_min_f16 v7, v0, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v4f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v6.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v4f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v7, v0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v4f16:
; GFX12: ; %bb.0:
@@ -1318,30 +1532,61 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v4f16__nsz:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v4, v1, v3
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_pk_min_f16 v7, v0, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v4f16__nsz:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v3.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v6.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v4f16__nsz:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v7, v0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v4f16__nsz:
; GFX12: ; %bb.0:
@@ -1535,48 +1780,97 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v8f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v8, v3, v7
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX11-NEXT: v_pk_min_f16 v10, v2, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-NEXT: v_pk_min_f16 v14, v1, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11
-; GFX11-NEXT: v_pk_min_f16 v11, v0, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11
-; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v8f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v10, v3, v7
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v7.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v8, v2, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v10.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v13, v0, v4
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, v1, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, 0x7e00, v7.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, 0x7e00, v8.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v12.l, v11.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v0.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v13.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v2, v2, v6, 0x5040100
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_perm_b32 v3, v7, v3, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v8f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v8, v3, v7
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v10, v2, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v14, v1, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v11, v0, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v8f16:
; GFX12: ; %bb.0:
@@ -1818,90 +2112,179 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_minimum_v16f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_min_f16 v16, v7, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15
-; GFX11-NEXT: v_pk_min_f16 v15, v6, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT: v_pk_min_f16 v20, v4, v12
-; GFX11-NEXT: v_pk_min_f16 v22, v2, v10
-; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15
-; GFX11-NEXT: v_pk_min_f16 v14, v5, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT: v_pk_min_f16 v17, v3, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11
-; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19
-; GFX11-NEXT: v_pk_min_f16 v19, v1, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22
-; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_pk_min_f16 v22, v0, v8
-; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23
-; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
-; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12
-; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimum_v16f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v18, v7, v15
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v7.l, v15.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v6.l, v14.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v19, v5, v13
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v18
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v16, v6, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, 0x7e00, v18.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, 0x7e00, v15.l, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v13.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v4.l, v12.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v13, v4, v12
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, 0x7e00, v16.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v18.l, v17.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v14.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, 0x7e00, v19.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v15.l, v12.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v10.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v16, v2, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, 0x7e00, v5.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, 0x7e00, v13.l, s2
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v13, v3, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, 0x7e00, v12.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v11.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v16.l, s0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v18, v0, v8
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v10.l
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, v1, v9
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v17.l, v15.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s4, v0.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s5, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v10.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v18.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v8.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v2.l, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v9.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v11.l, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, 0x7e00, v13.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v1, v9, v1, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v2, v2, v10, 0x5040100
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.h
+; GFX11-TRUE16-NEXT: v_perm_b32 v3, v8, v9, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v4, v10, v11, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v5, v5, v12, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v6, v13, v6, 0x5040100
+; GFX11-TRUE16-NEXT: v_perm_b32 v7, v14, v7, 0x5040100
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimum_v16f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v16, v7, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v15, v6, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v20, v4, v12
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v22, v2, v10
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v6
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v15
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v14, v5, v13
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v15, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v17, v3, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v11
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v3
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v19, v1, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: v_pk_min_f16 v22, v0, v8
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v21, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v17, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v14, 0x5040100
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_minimum_v16f16:
; GFX12: ; %bb.0:
@@ -1922,5 +2305,3 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
%op = call <16 x half> @llvm.minimum.v16f16(<16 x half> %src0, <16 x half> %src1)
ret <16 x half> %op
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 795ed6d542a139..3a2bf9d0094601 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
declare half @llvm.rint.f16(half %a)
@@ -47,23 +48,41 @@ define amdgpu_kernel void @rint_f16(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: rint_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: rint_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: rint_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rndne_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-LABEL: rint_f16:
; GFX12: ; %bb.0: ; %entry
@@ -166,27 +185,49 @@ define amdgpu_kernel void @rint_v2f16(
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: rint_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_rndne_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rndne_f16_e32 v1, v1
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: rint_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: rint_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_rndne_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_rndne_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-LABEL: rint_v2f16:
; GFX12: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 0d58afd1812dea..c1ba985d374530 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
declare half @llvm.trunc.f16(half %a)
@@ -46,23 +47,41 @@ define amdgpu_kernel void @trunc_f16(
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: trunc_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_trunc_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: trunc_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: trunc_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-LABEL: trunc_f16:
; GFX12: ; %bb.0: ; %entry
@@ -145,27 +164,49 @@ define amdgpu_kernel void @trunc_v2f16(
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: trunc_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_trunc_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_trunc_f16_e32 v1, v1
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: trunc_v2f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: trunc_v2f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
;
; GFX12-LABEL: trunc_v2f16:
; GFX12: ; %bb.0: ; %entry
More information about the llvm-commits
mailing list