[llvm] [AMDGPU][True16][CodeGen] Support AND/OR/XOR and LDEXP True16 format (PR #102620)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 10 11:21:33 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
Support AND/OR/XOR in true16 format and LDEXP in both true16 and fake16 formats.
These instructions were previously implemented with the fake16 profile only; this patch fixes that implementation.
Also adds an RA hint: when a 16-bit register feeds a 32-bit instruction, the allocator tries to use the enclosing 32-bit register directly instead of emitting an extra 16-bit move.
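For context, here is a minimal sketch of the hinting idea, condensed from the GCNPreRAOptimizations hunk in the diff below (the helper-function wrapper is hypothetical; the loop body mirrors the patch):

```cpp
// Sketch only: condensed from the GCNPreRAOptimizations change below.
// For a COPY between two virtual registers, one VGPR_32 and one VGPR_16,
// record a symmetric pairing hint. SIRegisterInfo::getRegAllocationHints
// later resolves the pair to a 32-bit register and its lo16 half, at which
// point the COPY becomes an identity move and can be eliminated.
static void addTrue16CopyHints(const MachineBasicBlock &MBB,
                               MachineRegisterInfo &MRI) {
  for (const MachineInstr &MI : MBB) {
    if (MI.getOpcode() != AMDGPU::COPY)
      continue;
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    if (Dst.isVirtual() && Src.isVirtual() &&
        MRI.getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
        MRI.getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
      MRI.setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
      MRI.setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
    }
  }
}
```

The virtual-virtual case is the interesting one; as the hunk below shows, the patch also handles COPYs where one side is already a physical register by hinting its lo16 subregister directly.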
---
Patch is 72.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102620.diff
12 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+38-12)
- (modified) llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp (+40)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+20)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+67)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+12)
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+39-2)
- (modified) llvm/lib/Target/AMDGPU/VOP2Instructions.td (+29-8)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+71-33)
- (modified) llvm/test/CodeGen/AMDGPU/fadd.f16.ll (+2-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll (+2-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll (+2-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+477-224)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f78699f88de56c..58835f7b35c4c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -161,18 +161,34 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
// TODO: Skip masking high bits if def is known boolean.
- bool IsSGPR = TRI.isSGPRClass(SrcRC);
- unsigned AndOpc =
- IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
- auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
- .addImm(1)
- .addReg(SrcReg);
- if (IsSGPR)
- And.setOperandDead(3); // Dead scc
-
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
- .addImm(0)
- .addReg(MaskedReg);
+ if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
+ assert(Subtarget->useRealTrue16Insts());
+ const int64_t NoMods = 0;
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
+ .addImm(NoMods)
+ .addImm(1)
+ .addImm(NoMods)
+ .addReg(SrcReg)
+ .addImm(NoMods);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
+ .addImm(NoMods)
+ .addImm(0)
+ .addImm(NoMods)
+ .addReg(MaskedReg)
+ .addImm(NoMods);
+ } else {
+ bool IsSGPR = TRI.isSGPRClass(SrcRC);
+ unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+ auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+ .addImm(1)
+ .addReg(SrcReg);
+ if (IsSGPR)
+ And.setOperandDead(3); // Dead scc
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+ .addImm(0)
+ .addReg(MaskedReg);
+ }
}
if (!MRI->getRegClassOrNull(SrcReg))
@@ -2206,6 +2222,16 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
+ if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
+ assert(STI.useRealTrue16Insts());
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *MBB = I.getParent();
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(SrcReg, 0, AMDGPU::lo16);
+ I.eraseFromParent();
+ return true;
+ }
+
if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4467836cffc566..e34448f2f0b9c4 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -22,11 +22,18 @@
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
+/// the VGPR_32, the COPY can be completely eliminated.
+
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -236,5 +243,38 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
Changed |= processReg(Reg);
}
+ if (!ST.useRealTrue16Insts())
+ return Changed;
+
+ // Add RA hints to improve True16 COPY elimination.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+ }
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c41850ab55f75c..5a139d1cf8d825 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2030,6 +2030,8 @@ def : GCNPat <
>;
foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let SubtargetPredicate = p in {
def : GCNPat <
(fabs (fp16vt VGPR_32:$src)),
(V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
@@ -2044,6 +2046,24 @@ def : GCNPat <
(fneg (fabs (fp16vt VGPR_32:$src))),
(V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;
+}
+
+let SubtargetPredicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (fabs (fp16vt VGPR_16:$src)),
+ (V_AND_B16_t16_e64 (i32 0), (i16 0x7fff), (i32 0), VGPR_16:$src)
+>;
+
+def : GCNPat <
+ (fneg (fp16vt VGPR_16:$src)),
+ (V_XOR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src)
+>;
+
+def : GCNPat <
+ (fneg (fabs (fp16vt VGPR_16:$src))),
+ (V_OR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src) // Set sign bit
+>;
+} // End SubtargetPredicate = UseRealTrue16Insts
} // End foreach fp16vt = ...
def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ee72837a50fc43..4e5fa782345cc3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3327,6 +3327,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
+
+ switch (Hint.first) {
+ case AMDGPURI::Size32: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys =
+ getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
+ &AMDGPU::VGPR_32RegClass);
+ }
+
+ // Prefer the paired physreg.
+ if (PairedPhys)
+ // isLo(Paired) is implicitly true here from the API of
+ // getMatchingSuperReg.
+ Hints.push_back(PairedPhys);
+ return false;
+ }
+ case AMDGPURI::Size16: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
+ }
+
+ // First prefer the paired physreg.
+ if (PairedPhys)
+ Hints.push_back(PairedPhys);
+ else {
+ // Add all the lo16 physregs.
+ // When the Paired operand has not yet been assigned a physreg it is
+ // better to try putting VirtReg in a lo16 register, because possibly
+ // later Paired can be assigned to the overlapping register and the COPY
+ // can be eliminated.
+ for (MCPhysReg PhysReg : Order) {
+ if (PhysReg == PairedPhys || AMDGPU::isHi(PhysReg, *this))
+ continue;
+ if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
+ !MRI.isReserved(PhysReg))
+ Hints.push_back(PhysReg);
+ }
+ }
+ return false;
+ }
+ default:
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
+}
+
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 88d5686720985e..622e86d03048a4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,13 @@ class LiveRegUnits;
class RegisterBank;
struct SGPRSpillBuilder;
+/// Register allocation hint types. Helps eliminate unneeded COPY with True16
+namespace AMDGPURI {
+
+enum { Size16 = 1, Size32 = 2 };
+
+} // end namespace AMDGPURI
+
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
@@ -326,6 +333,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 34d12aa5e07835..03e4cb9fcf49b7 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1397,7 +1397,8 @@ def : GCNPat <
} // End OtherPredicates = [isGFX8Plus]
-let OtherPredicates = [isGFX8Plus] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let OtherPredicates = [isGFX8Plus, p] in {
def : GCNPat<
(i32 (anyext i16:$src)),
(COPY $src)
@@ -1420,7 +1421,43 @@ def : GCNPat <
(EXTRACT_SUBREG $src, sub0)
>;
-} // End OtherPredicates = [isGFX8Plus]
+} // End OtherPredicates = [isGFX8Plus, p]
+
+let OtherPredicates = [UseFakeTrue16Insts] in {
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<anyext> i16:$src)),
+ (COPY $src)
+>;
+} // End OtherPredicates = [UseFakeTrue16Insts]
+
+
+let OtherPredicates = [UseRealTrue16Insts] in {
+def : GCNPat<
+ (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+ (COPY $src)
+>;
+
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<anyext> i16:$src)),
+ (REG_SEQUENCE VGPR_32, $src, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat<
+ (i64 (anyext i16:$src)),
+ (REG_SEQUENCE VReg_64, $src, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat<
+ (i16 (trunc i32:$src)),
+ (EXTRACT_SUBREG $src, lo16)
+>;
+
+def : GCNPat <
+ (i16 (trunc i64:$src)),
+ (EXTRACT_SUBREG $src, lo16)
+>;
+
+} // End OtherPredicates = [UseRealTrue16Insts]
//===----------------------------------------------------------------------===//
// GFX9
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d17b4f24081312..a9ebbb1a1886fa 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -922,18 +922,25 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> {
let HasSrc1FloatMods = 0;
let Src1ModSDWA = Int16SDWAInputMods;
}
-def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
+def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> {
+ let Src1RC32 = RegisterOperand<VGPR_16_Lo128>;
+ let Src1DPP = RegisterOperand<VGPR_16_Lo128>;
+ let Src1ModDPP = IntT16VRegInputMods<0/*IsFake16*/>;
+}
+def LDEXP_F16_VOPProfile_Fake16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
let Src1DPP = RegisterOperand<VGPR_32_Lo128>;
- let Src1ModDPP = IntT16VRegInputMods</* IsFake16= */ 1>;
+ let Src1ModDPP = IntT16VRegInputMods<1/*IsFake16*/>;
}
let isReMaterializable = 1 in {
let FPDPRounding = 1 in {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>;
- let SubtargetPredicate = HasTrue16BitInsts in
+ let SubtargetPredicate = UseRealTrue16Insts in
defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>;
+ let SubtargetPredicate = UseFakeTrue16Insts in
+ defm V_LDEXP_F16_fake16 : VOP2Inst <"v_ldexp_f16_fake16", LDEXP_F16_VOPProfile_Fake16, null_frag, "v_ldexp_f16_fake16">;
} // End FPDPRounding = 1
// FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
@@ -968,14 +975,27 @@ class LDEXP_F16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.
let OtherPredicates = [NotHasTrue16BitInsts] in
def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_e64>;
-let OtherPredicates = [HasTrue16BitInsts] in
-def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
+class LDEXP_F16_t16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.Pfl> : GCNPat <
+ (P.DstVT (op (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (i16 (VOP3Mods0 P.Src1VT:$src1, i32:$src1_modifiers)))),
+ (inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $clamp, /* clamp */
+ $omod, /* omod */
+ 0) /* op_sel */
+>;
+
+let OtherPredicates = [UseRealTrue16Insts] in
+def : LDEXP_F16_t16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
+
+let OtherPredicates = [UseFakeTrue16Insts] in
+def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_fake16_e64>;
let SubtargetPredicate = isGFX11Plus in {
let isCommutable = 1 in {
- defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, and>;
- defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, or>;
- defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, xor>;
+ defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>;
+ defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, or>;
+ defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, xor>;
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX11Plus
@@ -1714,6 +1734,7 @@ defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
+defm V_LDEXP_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 970bb08e1838b2..1228231a53bcce 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -17180,11 +17180,17 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fabs_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fabs_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fabs_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
ret bfloat %op
}
@@ -17266,11 +17272,17 @@ define bfloat @v_fneg_bf16(bfloat %a) {
; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fneg_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fneg_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fneg bfloat %a
ret bfloat %op
}
@@ -17365,11 +17377,17 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_fabs_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
ret bfloat %op
@@ -34518,15 +34536,25 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_select_fneg_lhs_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11TRUE16-NEX...
[truncated]
``````````
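As an aside, the 0x7fff/0x8000 immediates in the new fabs/fneg GCNPats are the standard sign-bit tricks: f16 and bf16 both keep the sign in bit 15. A self-contained illustration (plain C++, not part of the patch):

```cpp
#include <cstdint>
#include <cstdio>

// The masks used by the V_AND/V_XOR/V_OR_B16 patterns in the patch: both
// f16 and bf16 store the sign in bit 15, so on the raw bit pattern these
// implement fabs(x), -x, and -fabs(x) respectively.
static uint16_t fabs16(uint16_t Bits) { return Bits & 0x7fff; }    // clear sign bit
static uint16_t fneg16(uint16_t Bits) { return Bits ^ 0x8000; }    // flip sign bit
static uint16_t fnegAbs16(uint16_t Bits) { return Bits | 0x8000; } // set sign bit

int main() {
  uint16_t H = 0x3c00; // 1.0 in IEEE half precision
  printf("fabs=0x%04x fneg=0x%04x fneg(fabs)=0x%04x\n", fabs16(H), fneg16(H),
         fnegAbs16(H));
  return 0;
}
```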
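And a rough sketch of how the two hint kinds resolve once one side of the pair has a physical register, using the same TRI helpers as the SIRegisterInfo::getRegAllocationHints hunk above (a fragment only; the concrete register v1 is illustrative):

```cpp
// Assume the 32-bit side of the pair was assigned v1.
// Size16 hint: steer the 16-bit vreg to the low half of v1.
Register Lo = TRI->getSubReg(AMDGPU::VGPR1, AMDGPU::lo16); // v1.l
// Size32 hint: steer the 32-bit vreg to the register whose lo16
// subregister is the 16-bit side's assignment.
Register Super = TRI->getMatchingSuperReg(Lo, AMDGPU::lo16,
                                          &AMDGPU::VGPR_32RegClass); // v1
```

When both hints land, the COPY moves v1.l into v1.l (or v1 into v1) and is trivially removable.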
https://github.com/llvm/llvm-project/pull/102620
More information about the llvm-commits mailing list