[llvm] Main merge true16 codegen fma (under dev) (PR #122950)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 21 19:25:59 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/122950
>From cfd2c8f964361452787ee404b63dd8b4c77e2ab2 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 14 Jan 2025 00:59:51 -0500
Subject: [PATCH] true16 code pattern for fma
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 74 ++--
llvm/lib/Target/AMDGPU/SIInstructions.td | 8 +
.../Target/AMDGPU/SIShrinkInstructions.cpp | 17 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 68 ++--
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 328 +++++++++++++-----
6 files changed, 360 insertions(+), 137 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2bc19137b1ca0f..76720076a19c2a 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
+ case AMDGPU::V_FMAC_F16_t16_e64:
+ return AMDGPU::V_FMA_F16_gfx9_t16_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8fc32d9e60bf20..6b27c6a9878485 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3486,6 +3486,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
@@ -3506,6 +3507,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3539,16 +3541,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3563,7 +3568,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3618,23 +3623,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+ // V_FMAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3821,8 +3829,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F16_e32:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
- return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+ return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
: AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F32_e32:
case AMDGPU::V_FMAC_F32_e64:
@@ -3888,19 +3899,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
- assert(
- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
- "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
- "pre-RA");
+ assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+ "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+ "present "
+ "pre-RA");
// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3915,6 +3929,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4000,8 +4015,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16)
+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
+ : AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4018,11 +4036,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
- unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16)
- : AMDGPU::V_FMAMK_F32)
- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+ unsigned NewOpc = IsFMA
+ ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
+ : AMDGPU::V_FMAMK_F16)
+ : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4468,6 +4489,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
@@ -5520,7 +5542,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+ case AMDGPU::S_FMAC_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+ : AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 40a20fa9cb15ea..ae243ac0aaf5fa 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3237,6 +3237,14 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (fma (f16 (VOP3NoMods f16:$src0)),
+ (f16 (VOP3NoMods f16:$src1)),
+ (f16 (VOP3NoMods f16:$src2))),
+ (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2)
+>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 979812e07fc3f7..f03cde455f295c 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16;
+ NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
}
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16;
+ NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
}
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 99e6c5d06a0e19..0b09cabf25a161 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %fma
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_lhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg half %y
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_add:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg half %z
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
ret half %fma
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 822d40f7349b0f..30a8f5733d14b6 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -3,8 +3,10 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
@@ -24,11 +26,34 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fma:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fma:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fma:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fma:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fma:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fma:
; GFX12: ; %bb.0:
@@ -57,11 +82,31 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmac:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmac:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmac:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmac:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmac:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmac:
; GFX12: ; %bb.0:
@@ -98,11 +143,31 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmaak:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmaak:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmaak:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmaak:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmaak:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmaak:
; GFX12: ; %bb.0:
@@ -139,11 +204,33 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmamk:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmamk:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmamk:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmamk:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmamk:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmamk:
; GFX12: ; %bb.0:
@@ -208,33 +295,61 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: test_D139469_f16:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_D139469_f16:
-; GFX11-GISEL: ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
-; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_f16:
; GFX12-SDAG: ; %bb.0: ; %bb
@@ -346,44 +461,83 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: test_D139469_v2f16:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e
-; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_D139469_v2f16:
-; GFX11-GISEL: ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
-; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_v2f16:
; GFX12-SDAG: ; %bb.0: ; %bb
More information about the llvm-commits
mailing list