[llvm] [AMDGPU] Add pseudo scalar trans instructions for GFX12 (PR #75204)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 12 07:45:04 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mc
Author: Mirko Brkušanin (mbrkusanin)
<details>
<summary>Changes</summary>
---
Patch is 122.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75204.diff
18 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+20-8)
- (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/GCNProcessors.td (+2-2)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+14-2)
- (modified) llvm/lib/Target/AMDGPU/SISchedule.td (+36)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+22-12)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+53)
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+13)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir (+261)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll (+89)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-pseudo-scalar-transcendental.mir (+243)
- (added) llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll (+218)
- (added) llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll (+357)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vop3.s (+570)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt (+569)
- (added) llvm/test/tools/llvm-mca/AMDGPU/gfx12-pseudo-scalar-trans.s (+103)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 799e102d56174d..8de9b642493f49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -816,6 +816,12 @@ def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint",
"Has single-use VGPR hint instructions"
>;
+def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
+ "HasPseudoScalarTrans",
+ "true",
+ "Has Pseudo Scalar Transcendental instructions"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -1461,6 +1467,7 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
+ FeaturePseudoScalarTrans,
FeatureVGPRSingleUseHintInsts,
FeatureMADIntraFwdBug]>;
@@ -2002,6 +2009,9 @@ def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">,
AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>;
+def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
+ AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
+
def HasGDS : Predicate<"Subtarget->hasGDS()">;
def HasGWS : Predicate<"Subtarget->hasGWS()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79f..94f79656ae829a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3758,14 +3758,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
+ case AMDGPU::G_FSQRT:
+ case AMDGPU::G_FEXP2:
+ case AMDGPU::G_FLOG2: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
case AMDGPU::G_SSUBSAT:
case AMDGPU::G_UADDSAT:
case AMDGPU::G_USUBSAT:
case AMDGPU::G_FMAD:
- case AMDGPU::G_FSQRT:
- case AMDGPU::G_FEXP2:
- case AMDGPU::G_FLOG2:
case AMDGPU::G_FLDEXP:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
@@ -4230,12 +4236,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
- case Intrinsic::amdgcn_log:
- case Intrinsic::amdgcn_exp2:
- case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
- case Intrinsic::amdgcn_sqrt:
- case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
@@ -4292,6 +4293,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_sqrt: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ed019d26c1dfd8..84bf73e610733b 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -202,6 +202,7 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)
DECODE_OPERAND_REG_7(SReg_32, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 80669c04f2c688..96af1a6aab3da7 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -284,10 +284,10 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel,
// GCN GFX12.
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"gfx1200", GFX11SpeedModel,
+def : ProcessorModel<"gfx1200", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
-def : ProcessorModel<"gfx1201", GFX11SpeedModel,
+def : ProcessorModel<"gfx1201", GFX12SpeedModel,
FeatureISAVersion12.Features
>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6f..e9b5b67acb50b5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -197,6 +197,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool ScalarizeGlobal = false;
bool HasSALUFloatInsts = false;
bool HasVGPRSingleUseHintInsts = false;
+ bool HasPseudoScalarTrans = false;
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
@@ -1152,6 +1153,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
+ bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4e4526795f3b3..c72f985e94ceda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5161,6 +5161,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// clang-format off
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
+ // clang-format on
switch (MI.getOpcode()) {
default: return AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
@@ -5294,7 +5295,18 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
- }
+ case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
+ case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
+ case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
+ case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
+ case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
+ case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
+ case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
+ case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
+ case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
+ case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
+ }
+ // clang-format on
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
}
@@ -7145,7 +7157,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
- if (isVOP3(NewOpcode)) {
+ if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
// Intersperse VOP3 modifiers among the SALU operands.
NewInstr->addOperand(Inst.getOperand(0));
if (AMDGPU::getNamedOperandIdx(NewOpcode,
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index c67e647a7e7c70..b0e8e4112254d8 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -68,6 +68,9 @@ def Write8PassDGEMM : SchedWrite;
// Scalar float instructions
def WriteSFPU : SchedWrite;
+// F16 or F32 pseudo scalar transcendental instructions
+def WritePseudoScalarTrans : SchedWrite;
+
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
@@ -93,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
+def GFX12SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -174,6 +178,7 @@ multiclass SICommonWriteRes {
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
def : UnsupportedWriteRes<WriteSFPU>;
+ def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1
def : ReadAdvance<MIVGPRRead, -2>;
@@ -318,6 +323,7 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
def : UnsupportedWriteRes<WriteSFPU>;
+def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -351,6 +357,36 @@ def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
} // End RetireOOO = 1
+def : UnsupportedWriteRes<WritePseudoScalarTrans>;
+
def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX11SpeedModel
+
+let SchedModel = GFX12SpeedModel in {
+
+def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
+def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;
+
+def : HWWriteRes<WriteBranch, [HWBranch], 32>;
+def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
+def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
+def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
+def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = GFX12SpeedModel
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9ff64968ef01b2..ddd6f8bfe32dc6 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -592,19 +592,8 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
-def SelectPat : PatFrag <
- (ops node:$src1, node:$src2),
- (select SCC, $src1, $src2),
- [{ return !N->isDivergent(); }]
->;
-
let Uses = [SCC] in {
- let AddedComplexity = 20 in {
- def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
- [(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))]
- >;
- }
-
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
@@ -1697,6 +1686,27 @@ def : GetFPModePat<fpmode_mask_gfx6plus>;
// SOP2 Patterns
//===----------------------------------------------------------------------===//
+def UniformSelect : PatFrag<
+ (ops node:$src0, node:$src1),
+ (select SCC, $src0, $src1),
+ [{ return !N->isDivergent(); }]
+>;
+
+let AddedComplexity = 20 in {
+ def : GCNPat<
+ (i32 (UniformSelect i32:$src0, i32:$src1)),
+ (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
+ >;
+
+ // TODO: The predicate should not be necessary, but enabling this pattern for
+ // all subtargets generates worse code in some cases.
+ let OtherPredicates = [HasPseudoScalarTrans] in
+ def : GCNPat<
+ (f32 (UniformSelect f32:$src0, f32:$src1)),
+ (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
+ >;
+}
+
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 617773b34ae98c..0f5631195d0c19 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -810,6 +810,49 @@ let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}
+class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
+ ValueType dstVt, ValueType srcVt = dstVt>
+ : VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+ let DstRC = VOPDstOperand<Dst>;
+ let Src0RC64 = SrcOp;
+
+ let HasOMod = 1;
+ let HasModifiers = 1;
+}
+
+def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
+def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;
+
+let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
+ isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
+ defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32, AMDGPUexp>;
+ defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32, AMDGPUlog>;
+ defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32, AMDGPUrcp>;
+ defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32, AMDGPUrsq>;
+ defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32, any_amdgcn_sqrt>;
+ defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>;
+}
+
+class PseudoScalarPatF16<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat <
+ (f16 (UniformUnaryFrag<node> (f16 (VOP3Mods0 f16:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)))),
+ (f16 (COPY_TO_REGCLASS (f32 (inst i32:$src0_modifiers, f16:$src0, i1:$clamp,
+ i32:$omod)),
+ SReg_32_XEXEC))
+>;
+
+let SubtargetPredicate = HasPseudoScalarTrans in {
+ def : PseudoScalarPatF16<AMDGPUexpf16, V_S_EXP_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUlogf16, V_S_LOG_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUrcp, V_S_RCP_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUrsq, V_S_RSQ_F16_e64>;
+ def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
+}
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -868,6 +911,16 @@ defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32
defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">;
+defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
+defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
+defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
+defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>;
+defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>;
+defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>;
+defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>;
+defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>;
+defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>;
+defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>;
defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">;
defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 96184486e06f77..fd4626d902acea 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1303,6 +1303,19 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
} // end SubtargetPredicate = isGFX11Plus
}
+class UniformUnaryFragOrOp<SDPatternOperator Op> {
+ SDPatternOperator ret = !if(!or(!isa<SDNode>(Op), !isa<PatFrags>(Op)),
+ UniformUnaryFrag<Op>, Op);
+}
+
+multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag> {
+ def _e64 : VOP3_Pseudo<OpName, P, [(set P.DstVT:$vdst,
+ (UniformUnaryFragOrOp<node>.ret
+ (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp,
+ i32:$omod))))]>;
+}
+
//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir
new file mode 100644
index 00000000000000..92cffb1b47bd1c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pseudo-scalar-transcendental.mir
@@ -0,0 +1,261 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: v_s_exp_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_exp_f32
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_EXP_F32_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_EXP_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_S_EXP_F32_e64_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), %0
+ $vgpr0 = COPY %1(s32)
+
+...
+---
+name: v_s_exp_f16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: v_s_exp_f16
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[V_S_EXP_F16_e64_:%[0-9]+]]:sreg_32_xexec = nofpexcept V_S_EXP_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[V_S_EXP_F16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY1]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.exp2), %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $vgpr0 = COPY %3(s32)
+
+...
+---
+name: v_s_log_f32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/75204
More information about the llvm-commits
mailing list