[llvm] [AMDGPU][True16][CodeGen] true16 codegen pattern for fma (PR #122950)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 13 07:52:24 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/122950
>From 5b2834bec565e106845cf8ddba4a574d3f31410d Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 14 Jan 2025 00:59:51 -0500
Subject: [PATCH 1/3] true16 code pattern for fma
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 74 ++--
llvm/lib/Target/AMDGPU/SIInstructions.td | 8 +
.../Target/AMDGPU/SIShrinkInstructions.cpp | 17 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 68 ++--
.../CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 3 +-
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 328 +++++++++++++-----
.../CodeGen/AMDGPU/shrink-mad-fma-fake16.mir | 242 +++++++++++++
.../CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir | 258 ++++++++++++++
llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir | 115 +-----
10 files changed, 871 insertions(+), 244 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2bc19137b1ca0..76720076a19c2 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -198,6 +198,8 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
+ case AMDGPU::V_FMAC_F16_t16_e64:
+ return AMDGPU::V_FMA_F16_gfx9_t16_e64;
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1d98d68a2ea5d..aa3cff6b9442c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3544,6 +3544,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
@@ -3564,6 +3565,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3597,16 +3599,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3621,7 +3626,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3676,23 +3681,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
+ : ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
- // would also require restricting their register classes. For now
- // just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
+ // restricting their register classes. For now just bail out.
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3879,8 +3887,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
return AMDGPU::V_FMA_LEGACY_F32_e64;
case AMDGPU::V_FMAC_F16_e32:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
- return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
+ return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
: AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F32_e32:
case AMDGPU::V_FMAC_F32_e64:
@@ -3946,19 +3957,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
- assert(
- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
- "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
- "pre-RA");
+ assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
+ "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
+ "present "
+ "pre-RA");
// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3973,6 +3987,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4058,8 +4073,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16)
+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAAK_F16_t16
+ : AMDGPU::V_FMAAK_F16_fake16
+ : AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
if (pseudoToMCOpcode(NewOpc) != -1) {
@@ -4076,11 +4094,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
- unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16)
- : AMDGPU::V_FMAMK_F32)
- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
+ unsigned NewOpc = IsFMA
+ ? (IsF16 ? (ST.hasTrue16BitInsts()
+ ? ST.useRealTrue16Insts()
+ ? AMDGPU::V_FMAMK_F16_t16
+ : AMDGPU::V_FMAMK_F16_fake16
+ : AMDGPU::V_FMAMK_F16)
+ : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
if (pseudoToMCOpcode(NewOpc) != -1) {
MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -4526,6 +4547,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
@@ -5578,7 +5600,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
+ case AMDGPU::S_FMAC_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_FMAC_F16_t16_e64
+ : AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 40a20fa9cb15e..ae243ac0aaf5f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3237,6 +3237,14 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+ (fma (f16 (VOP3NoMods f16:$src0)),
+ (f16 (VOP3NoMods f16:$src1)),
+ (f16 (VOP3NoMods f16:$src2))),
+ (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2)
+>;
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f16:$src0)),
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 979812e07fc3f..f03cde455f295 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAAK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
- : AMDGPU::V_FMAAK_F16;
+ NewOpcode = AMDGPU::V_FMAAK_F16_fake16;
break;
}
}
@@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16;
+ break;
+ case AMDGPU::V_FMA_F16_gfx9_t16_e64:
+ NewOpcode = AMDGPU::V_FMAMK_F16_t16;
+ break;
case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
- : AMDGPU::V_FMAMK_F16;
+ NewOpcode = AMDGPU::V_FMAMK_F16_fake16;
break;
}
}
@@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 ||
+ MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 ||
MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) {
shrinkMadFma(MI);
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 99e6c5d06a0e1..0b09cabf25a16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -3,7 +3,8 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX6-LABEL: v_fma_f32:
@@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %fma
}
@@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_lhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
ret half %fma
@@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_rhs:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg half %y
%fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
ret half %fma
@@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fma_f16_fneg_add:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%neg.z = fneg half %z
%fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
ret half %fma
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index ac7944f25fe37..23e4b80b61f69 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
+# FIXME-TRUE16: re-enable after fix-sgpr-copies is fixed for the true16 flow
+# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 822d40f7349b0..30a8f5733d14b 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -3,8 +3,10 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
@@ -24,11 +26,34 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fma:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fma:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fma:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fma:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fma:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fma:
; GFX12: ; %bb.0:
@@ -57,11 +82,31 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmac:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmac:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmac:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmac:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmac:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmac:
; GFX12: ; %bb.0:
@@ -98,11 +143,31 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmaak:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmaak:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmaak:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmaak:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmaak:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmaak:
; GFX12: ; %bb.0:
@@ -139,11 +204,33 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_fmamk:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_fmamk:
+; GFX11-SDAG-TRUE16: ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_fmamk:
+; GFX11-SDAG-FAKE16: ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_fmamk:
+; GFX11-GISEL-TRUE16: ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_fmamk:
+; GFX11-GISEL-FAKE16: ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_fmamk:
; GFX12: ; %bb.0:
@@ -208,33 +295,61 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: test_D139469_f16:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_D139469_f16:
-; GFX11-GISEL: ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
-; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_f16:
; GFX12-SDAG: ; %bb.0: ; %bb
@@ -346,44 +461,83 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-LABEL: test_D139469_v2f16:
-; GFX11-SDAG: ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e
-; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_D139469_v2f16:
-; GFX11-GISEL: ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
-; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16:
+; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16:
+; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: test_D139469_v2f16:
; GFX12-SDAG: ; %bb.0: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir
new file mode 100644
index 0000000000000..d551ad88f56b7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir
@@ -0,0 +1,242 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11
+
+---
+name: mad_cvv_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_cvv_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vcv_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_vcv_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vvc_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_vvc_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vsc_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_vsc_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_cvv_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_cvv_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vcv_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_vcv_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vvc_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_vvc_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vsc_f32
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_vsc_f32
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_cvv_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_cvv_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vcv_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_vcv_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vvc_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_vvc_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vsc_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mad_vsc_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_cvv_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_cvv_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vcv_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_vcv_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vvc_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_vvc_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vsc_f16
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: fma_vsc_f16
+ ; GFX11: $vgpr0 = IMPLICIT_DEF
+ ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: SI_RETURN implicit $vgpr2
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir
new file mode 100644
index 0000000000000..89ef5df9beb8e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir
@@ -0,0 +1,258 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10
+
+---
+name: mad_cvv_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_cvv_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vcv_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_vcv_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vvc_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_vvc_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vsc_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_vsc_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_cvv_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_cvv_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vcv_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_vcv_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vvc_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_vvc_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vsc_f32
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_vsc_f32
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_cvv_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_cvv_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vcv_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_vcv_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vvc_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_vvc_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: mad_vsc_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: mad_vsc_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_cvv_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_cvv_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vcv_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_vcv_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vvc_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_vvc_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
+
+---
+name: fma_vsc_f16
+body: |
+ bb.0:
+ ; GFX10-LABEL: name: fma_vsc_f16
+ ; GFX10: $vgpr0 = IMPLICIT_DEF
+ ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
+ ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr2
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
+ SI_RETURN implicit $vgpr2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
index 26feb8120c751..c9138dda7d1a7 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
@@ -1,17 +1,10 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11
---
name: mad_cvv_f32
body: |
bb.0:
- ; GFX10-LABEL: name: mad_cvv_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_cvv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -27,12 +20,6 @@ body: |
name: mad_vcv_f32
body: |
bb.0:
- ; GFX10-LABEL: name: mad_vcv_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_vcv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -48,12 +35,6 @@ body: |
name: mad_vvc_f32
body: |
bb.0:
- ; GFX10-LABEL: name: mad_vvc_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_vvc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -69,12 +50,6 @@ body: |
name: mad_vsc_f32
body: |
bb.0:
- ; GFX10-LABEL: name: mad_vsc_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_vsc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
@@ -90,12 +65,6 @@ body: |
name: fma_cvv_f32
body: |
bb.0:
- ; GFX10-LABEL: name: fma_cvv_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_cvv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -111,12 +80,6 @@ body: |
name: fma_vcv_f32
body: |
bb.0:
- ; GFX10-LABEL: name: fma_vcv_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_vcv_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -132,12 +95,6 @@ body: |
name: fma_vvc_f32
body: |
bb.0:
- ; GFX10-LABEL: name: fma_vvc_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_vvc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -153,12 +110,6 @@ body: |
name: fma_vsc_f32
body: |
bb.0:
- ; GFX10-LABEL: name: fma_vsc_f32
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_vsc_f32
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
@@ -174,12 +125,6 @@ body: |
name: mad_cvv_f16
body: |
bb.0:
- ; GFX10-LABEL: name: mad_cvv_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_cvv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -195,12 +140,6 @@ body: |
name: mad_vcv_f16
body: |
bb.0:
- ; GFX10-LABEL: name: mad_vcv_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_vcv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -216,12 +155,6 @@ body: |
name: mad_vvc_f16
body: |
bb.0:
- ; GFX10-LABEL: name: mad_vvc_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_vvc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
@@ -237,12 +170,6 @@ body: |
name: mad_vsc_f16
body: |
bb.0:
- ; GFX10-LABEL: name: mad_vsc_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: mad_vsc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
@@ -258,20 +185,14 @@ body: |
name: fma_cvv_f16
body: |
bb.0:
- ; GFX10-LABEL: name: fma_cvv_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_cvv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$vgpr1 = IMPLICIT_DEF
- $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, 18688, 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec
SI_RETURN implicit $vgpr2
...
@@ -279,20 +200,14 @@ body: |
name: fma_vcv_f16
body: |
bb.0:
- ; GFX10-LABEL: name: fma_vcv_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_vcv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$vgpr1 = IMPLICIT_DEF
- $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, 18688, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec
SI_RETURN implicit $vgpr2
...
@@ -300,20 +215,14 @@ body: |
name: fma_vvc_f16
body: |
bb.0:
- ; GFX10-LABEL: name: fma_vvc_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_vvc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_lo16, $vgpr1_lo16, 18688, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$vgpr1 = IMPLICIT_DEF
- $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
SI_RETURN implicit $vgpr2
...
@@ -321,19 +230,13 @@ body: |
name: fma_vsc_f16
body: |
bb.0:
- ; GFX10-LABEL: name: fma_vsc_f16
- ; GFX10: $vgpr0 = IMPLICIT_DEF
- ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF
- ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
- ; GFX10-NEXT: SI_RETURN implicit $vgpr2
- ;
; GFX11-LABEL: name: fma_vsc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_hi16, $vgpr1_hi16, 18688, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$sgpr1 = IMPLICIT_DEF
- $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_hi16, 0, $vgpr1_hi16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec
SI_RETURN implicit $vgpr2
...
>From 583ac5c453bbc6545dfbc51a5e6f2e6817d8466c Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 27 Jan 2025 11:26:31 -0500
Subject: [PATCH 2/3] tmp
---
.../lib/CodeGen/TwoAddressInstructionPass.cpp | 41 ++++++++++++++++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 9 ++++
2 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index fb6274b09919b..9009bbe49a9af 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -529,24 +529,34 @@ MachineInstr *TwoAddressInstructionImpl::findOnlyInterestingUse(
/// to.
static MCRegister getMappedReg(Register Reg,
DenseMap<Register, Register> &RegMap) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
while (Reg.isVirtual()) {
DenseMap<Register, Register>::iterator SI = RegMap.find(Reg);
- if (SI == RegMap.end())
+ if (SI == RegMap.end()) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
return 0;
+ }
Reg = SI->second;
}
- if (Reg.isPhysical())
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
+ if (Reg.isPhysical()) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
return Reg;
+ }
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
return 0;
}
/// Return true if the two registers are equal or aliased.
bool TwoAddressInstructionImpl::regsAreCompatible(Register RegA,
Register RegB) const {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
if (RegA == RegB)
return true;
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
if (!RegA || !RegB)
return false;
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
return TRI->regsOverlap(RegA, RegB);
}
@@ -774,10 +784,14 @@ bool TwoAddressInstructionImpl::isProfitableToConv3Addr(Register RegA,
// %reg1026 = ADD %reg1024, %reg1025
// r2 = MOV %reg1026
// Turn ADD into a 3-address instruction to avoid a copy.
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
MCRegister FromRegB = getMappedReg(RegB, SrcRegMap);
if (!FromRegB)
return false;
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
MCRegister ToRegA = getMappedReg(RegA, DstRegMap);
+
+ LLVM_DEBUG(dbgs() << ToRegA << "\n");
return (ToRegA && !regsAreCompatible(FromRegB, ToRegA));
}
@@ -831,6 +845,7 @@ void TwoAddressInstructionImpl::scanUses(Register DstReg) {
bool IsCopy = false;
Register NewReg;
Register Reg = DstReg;
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
while (MachineInstr *UseMI =
findOnlyInterestingUse(Reg, MBB, IsCopy, NewReg, IsDstPhys)) {
if (IsCopy && !Processed.insert(UseMI).second)
@@ -842,6 +857,7 @@ void TwoAddressInstructionImpl::scanUses(Register DstReg) {
break;
if (IsDstPhys) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
VirtRegPairs.push_back(NewReg);
break;
}
@@ -850,10 +866,12 @@ void TwoAddressInstructionImpl::scanUses(Register DstReg) {
Reg = NewReg;
}
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
if (!VirtRegPairs.empty()) {
unsigned ToReg = VirtRegPairs.back();
VirtRegPairs.pop_back();
while (!VirtRegPairs.empty()) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
unsigned FromReg = VirtRegPairs.pop_back_val();
bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
if (!isNew)
@@ -864,6 +882,7 @@ void TwoAddressInstructionImpl::scanUses(Register DstReg) {
if (!isNew)
assert(DstRegMap[DstReg] == ToReg && "Can't map to two dst registers!");
}
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
}
/// If the specified instruction is not yet processed, process it if it's a
@@ -1312,9 +1331,11 @@ bool TwoAddressInstructionImpl::tryInstructionCommute(MachineInstr *MI,
bool TwoAddressInstructionImpl::tryInstructionTransform(
MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi,
unsigned SrcIdx, unsigned DstIdx, unsigned &Dist, bool shouldOnlyCommute) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
if (OptLevel == CodeGenOptLevel::None)
return false;
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
MachineInstr &MI = *mi;
Register regA = MI.getOperand(DstIdx).getReg();
Register regB = MI.getOperand(SrcIdx).getReg();
@@ -1325,6 +1346,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
if (regA.isVirtual())
scanUses(regA);
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
// If the instruction is convertible to 3 Addr, instead
@@ -1355,11 +1377,18 @@ bool TwoAddressInstructionImpl::tryInstructionTransform(
regBKilled = isKilled(MI, regB, true);
}
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
if (MI.isConvertibleTo3Addr()) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
// This instruction is potentially convertible to a true
// three-address instruction. Check if it is profitable.
+ LLVM_DEBUG(dbgs() << regBKilled << "\n");
+ LLVM_DEBUG(dbgs() << regA << "\n");
+ LLVM_DEBUG(dbgs() << regB << "\n");
+ LLVM_DEBUG(dbgs() << isProfitableToConv3Addr(regA, regB) << "\n");
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
// Try to convert it.
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
if (convertInstTo3Addr(mi, nmi, regA, regB, Dist)) {
++NumConvertedTo3Addr;
return true; // Done with this instruction.
@@ -1874,21 +1903,29 @@ bool TwoAddressInstructionImpl::run() {
++NumTwoAddressInstrs;
MadeChange = true;
+
+ LLVM_DEBUG(dbgs() << "tiedoperands.size" << TiedOperands.size() << "\n");
LLVM_DEBUG(dbgs() << '\t' << *mi);
// If the instruction has a single pair of tied operands, try some
// transformations that may either eliminate the tied operands or
// improve the opportunities for coalescing away the register copy.
if (TiedOperands.size() == 1) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
SmallVectorImpl<std::pair<unsigned, unsigned>> &TiedPairs
= TiedOperands.begin()->second;
+ LLVM_DEBUG(dbgs() << "TiedPairs.size" << TiedPairs.size() << "\n");
if (TiedPairs.size() == 1) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
unsigned SrcIdx = TiedPairs[0].first;
unsigned DstIdx = TiedPairs[0].second;
Register SrcReg = mi->getOperand(SrcIdx).getReg();
Register DstReg = mi->getOperand(DstIdx).getReg();
+ LLVM_DEBUG(dbgs() << SrcReg << "\n");
+ LLVM_DEBUG(dbgs() << DstReg << "\n");
if (SrcReg != DstReg &&
tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) {
+ LLVM_DEBUG(dbgs() << __FILE__ << __LINE__ << "\n");
// The tied operands have been eliminated or shifted further down
// the block to ease elimination. Continue processing with 'nmi'.
TiedOperands.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index aa3cff6b9442c..cf895bae7a787 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3910,6 +3910,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
MachineBasicBlock &MBB = *MI.getParent();
unsigned Opc = MI.getOpcode();
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
+
// Handle MFMA.
int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
if (NewMFMAOpc != -1) {
@@ -3943,6 +3945,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
if (SIInstrInfo::isWMMA(MI)) {
unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
@@ -3957,6 +3960,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
"V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
@@ -3982,6 +3986,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
bool Src0Literal = false;
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
switch (Opc) {
default:
return nullptr;
@@ -4015,6 +4020,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
MachineInstrBuilder MIB;
const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
@@ -4070,6 +4076,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
};
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
@@ -4094,6 +4101,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
}
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
unsigned NewOpc = IsFMA
? (IsF16 ? (ST.hasTrue16BitInsts()
? ST.useRealTrue16Insts()
@@ -4143,6 +4151,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
}
+ LLVM_DEBUG(dbgs() << __FILE__ <<__LINE__ << "\n");
// VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
// if VOP3 does not allow a literal operand.
>From 845ff5aab553f31d0df1fa2c8cda25391d276b67 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 13 Feb 2025 10:52:06 -0500
Subject: [PATCH 3/3] correct comment
---
llvm/lib/CodeGen/VirtRegMap.cpp | 5 +++++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index b3a7acc15b3dc..b905ecd8949e2 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -602,7 +602,9 @@ void VirtRegRewriter::rewrite() {
MBBI != MBBE; ++MBBI) {
LLVM_DEBUG(MBBI->print(dbgs(), Indexes));
for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) {
+ LLVM_DEBUG(dbgs() << __LINE__ << "MI:" << MI << "\n");
for (MachineOperand &MO : MI.operands()) {
+ LLVM_DEBUG(dbgs() << __LINE__ << "MO:" << MO << "\n");
// Make sure MRI knows about registers clobbered by regmasks.
if (MO.isRegMask())
MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
@@ -610,7 +612,9 @@ void VirtRegRewriter::rewrite() {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
Register VirtReg = MO.getReg();
+ LLVM_DEBUG(dbgs() << __LINE__ << "VirtReg:" << VirtReg << "\n");
MCRegister PhysReg = VRM->getPhys(VirtReg);
+ LLVM_DEBUG(dbgs() << __LINE__ << "PhysReg:" << PhysReg << "\n");
if (!PhysReg)
continue;
@@ -624,6 +628,7 @@ void VirtRegRewriter::rewrite() {
// Preserve semantics of sub-register operands.
unsigned SubReg = MO.getSubReg();
+ LLVM_DEBUG(dbgs() << __LINE__ << "SubReg:" << SubReg << "\n");
if (SubReg != 0) {
if (NoSubRegLiveness || !MRI->shouldTrackSubRegLiveness(VirtReg)) {
// A virtual register kill refers to the whole register, so we may
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cf895bae7a787..0fed4773b067d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3689,7 +3689,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
// takes VGPR_32_Lo128 operands, so the rewrite would also require
// restricting their register classes. For now just bail out.
if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
More information about the llvm-commits
mailing list