[llvm] 20d2015 - [AMDGPU] gfx11 VINTERP intrinsics and ISel support
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 17 06:46:28 PDT 2022
Author: Joe Nash
Date: 2022-06-17T09:16:59-04:00
New Revision: 20d20156f4ce478407c4d5a0ef89afce17eddcf0
URL: https://github.com/llvm/llvm-project/commit/20d20156f4ce478407c4d5a0ef89afce17eddcf0
DIFF: https://github.com/llvm/llvm-project/commit/20d20156f4ce478407c4d5a0ef89afce17eddcf0.diff
LOG: [AMDGPU] gfx11 VINTERP intrinsics and ISel support
Depends on D127664
Reviewed By: rampitec, #amdgpu
Differential Revision: https://reviews.llvm.org/D127756
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUGISel.td
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/lib/Target/AMDGPU/VINTERPInstructions.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c2a6534def95..f55b83bab699 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1505,6 +1505,34 @@ def int_amdgcn_lds_param_load :
[IntrNoMem, IntrSpeculatable, IntrWillReturn,
ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> -- three f32 operands, f32 result
+def int_amdgcn_interp_inreg_p10 :
+ Intrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> -- three f32 operands, f32 result
+def int_amdgcn_interp_inreg_p2 :
+ Intrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> -- f32 operands, f32 result
+// high (an i1 immediate) selects whether high or low 16-bits are used for p and p0 operands
+def int_amdgcn_interp_inreg_p10_f16:
+ Intrinsic<[llvm_float_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+ ImmArg<ArgIndex<3>>]>; // operand 3 (<high>) must be an immediate
+
+// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> -- f32 operands, f16 result
+// high (an i1 immediate) selects whether high or low 16-bits are used for p operand
+def int_amdgcn_interp_inreg_p2_f16 :
+ Intrinsic<[llvm_half_ty],
+ [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn,
+ ImmArg<ArgIndex<3>>]>; // operand 3 (<high>) must be an immediate
+
// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : Intrinsic <
[llvm_i1_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index eba63ccdcba1..af4b4895809f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -59,6 +59,14 @@ def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
GIComplexPatternEquiv<VOP3OpSelMods>;
+def gi_vinterpmods :
+ GIComplexOperandMatcher<s32, "selectVINTERPMods">, // names AMDGPUInstructionSelector::selectVINTERPMods
+ GIComplexPatternEquiv<VINTERPMods>; // GlobalISel counterpart of the VINTERPMods ComplexPattern
+
+def gi_vinterpmods_hi :
+ GIComplexOperandMatcher<s32, "selectVINTERPModsHi">, // names AMDGPUInstructionSelector::selectVINTERPModsHi
+ GIComplexPatternEquiv<VINTERPModsHi>; // GlobalISel counterpart of the VINTERPModsHi ComplexPattern
+
// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
def gi_vop3opsel :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e25165eac914..19648310f607 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,7 +13,9 @@
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -2606,6 +2608,30 @@ bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
+ SDValue &SrcMods, // out: modifier bits as an i32 target constant
+ bool OpSel) const { // OpSel: additionally set OP_SEL_0 in the modifiers
+ unsigned Mods; // not initialised here; assumed to be written by SelectVOP3ModsImpl (callee not shown)
+ if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { // reuse the VOP3 modifier matcher with AllowAbs = false
+ if (OpSel)
+ Mods |= SISrcMods::OP_SEL_0;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); // package the modifier bits as an i32 target constant
+ return true;
+ }
+
+ return false; // VOP3 modifier matching failed; SrcMods is left untouched
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false); // plain variant: OP_SEL_0 is not requested
+}
+
+bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true); // "Hi" variant: OP_SEL_0 is set in the modifiers
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 69d79fc815a3..28c79d83dbfb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -218,6 +218,11 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
+ bool SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods,
+ bool OpSel) const; // shared implementation; OpSel requests OP_SEL_0
+ bool SelectVINTERPMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; // calls the Impl with OpSel = false
+ bool SelectVINTERPModsHi(SDValue In, SDValue &Src, SDValue &SrcMods) const; // calls the Impl with OpSel = true
+
bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp,
SDValue &Omod) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f89cdd42ee45..6bf80e0dc90c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3497,9 +3497,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
- bool AllowAbs) const {
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+ MachineOperand &Root, bool AllowAbs, bool OpSel, bool ForceVGPR) const {
Register Src = Root.getReg();
Register OrigSrc = Src;
unsigned Mods = 0;
@@ -3516,7 +3515,10 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
Mods |= SISrcMods::ABS;
}
- if (Mods != 0 &&
+ if (OpSel)
+ Mods |= SISrcMods::OP_SEL_0;
+
+ if ((Mods != 0 || ForceVGPR) &&
RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
MachineInstr *UseMI = Root.getParent();
@@ -3708,6 +3710,36 @@ AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { // GlobalISel matcher for VINTERP sources, op_sel not set
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /* AllowAbs */ false,
+ /* OpSel */ false,
+ /* ForceVGPR */ true); // ForceVGPR: take the non-VGPR-bank handling even when Mods == 0
+
+ return {{ // always yields renderers; this function has no failure path
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, // source register
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src_mods (the patterns use this for src0, src1 and src2)
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { // as selectVINTERPMods, but with OP_SEL_0 set
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /* AllowAbs */ false,
+ /* OpSel */ true,
+ /* ForceVGPR */ true); // ForceVGPR: take the non-VGPR-bank handling (Mods is already non-zero here)
+
+ return {{ // always yields renderers; this function has no failure path
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, // source register
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src_mods, including OP_SEL_0
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
SmallVector<GEPInfo, 4> AddrInfo;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 27565f9ca687..fcf96cb85d9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -150,8 +150,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
- std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
- bool AllowAbs = true) const;
+ std::pair<Register, unsigned>
+ selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true,
+ bool OpSel = false, bool ForceVGPR = false) const; // the two new flags default to false
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -191,6 +192,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVINTERPMods(MachineOperand &Root) const; // VINTERP source + modifiers, OpSel = false
+ InstructionSelector::ComplexRendererFns
+ selectVINTERPModsHi(MachineOperand &Root) const; // VINTERP source + modifiers, OpSel = true
+
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8f046d07ed06..79fdae5bcee3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3017,6 +3017,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
return;
}
+ case Intrinsic::amdgcn_interp_inreg_p10:
+ case Intrinsic::amdgcn_interp_inreg_p2:
+ case Intrinsic::amdgcn_interp_inreg_p10_f16:
+ case Intrinsic::amdgcn_interp_inreg_p2_f16:
+ applyDefaultMapping(OpdMapper);
+ return;
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16: {
// Doing a waterfall loop over these wouldn't make any sense.
@@ -4469,6 +4475,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
break;
}
+ case Intrinsic::amdgcn_interp_inreg_p10:
+ case Intrinsic::amdgcn_interp_inreg_p2:
+ case Intrinsic::amdgcn_interp_inreg_p10_f16:
+ case Intrinsic::amdgcn_interp_inreg_p2_f16: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ break;
+ }
case Intrinsic::amdgcn_ballot: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 11553c3d1c9a..c63fbbc241d9 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -111,10 +111,55 @@ def V_INTERP_P2_RTZ_F16_F32_inreg :
} // SubtargetPredicate = isGFX11Plus
+class VInterpF32Pat <SDPatternOperator op, Instruction inst> : GCNPat < // selects `inst` for the 3-source f32 intrinsic `op`
+ (f32 (op
+ (VINTERPMods f32:$src0, i32:$src0_modifiers),
+ (VINTERPMods f32:$src1, i32:$src1_modifiers),
+ (VINTERPMods f32:$src2, i32:$src2_modifiers))),
+ (inst $src0_modifiers, $src0, // each modifier operand precedes its source
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ 0, /* clamp */
+ 7) /* wait_exp: always emitted as 7 by this pattern */
+>;
+
def VINTERP_OPSEL {
int LOW = 0;
int HIGH = 0xa;
}
+
+class VInterpF16Pat <SDPatternOperator op, Instruction inst, // selects `inst` for a 3-source intrinsic with an i1 <high> immediate
+ ValueType dst_type, bit high,
+ list<ComplexPattern> pat> : GCNPat < // pat[i] is the operand matcher used for src<i>
+ (dst_type (op
+ (pat[0] f32:$src0, i32:$src0_modifiers),
+ (pat[1] f32:$src1, i32:$src1_modifiers),
+ (pat[2] f32:$src2, i32:$src2_modifiers),
+ !if(high, (i1 -1), (i1 0)))), // match only calls whose <high> immediate equals the `high` bit
+ (inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ 0, /* clamp */
+ /* op_sel = 0; the high half is requested via OP_SEL_0 in the src modifiers */
+ 7) /* wait_exp */
+>;
+
+multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst, // emits the high = 0 and high = 1 patterns for one intrinsic
+ ValueType dst_type, list<ComplexPattern> high_pat> {
+ def : VInterpF16Pat<op, inst, dst_type, 0, // high = 0: plain VINTERPMods on every source
+ [VINTERPMods, VINTERPMods, VINTERPMods]>;
+ def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>; // high = 1: caller-supplied per-source matchers
+}
+
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
+def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
+ V_INTERP_P10_F16_F32_inreg, f32,
+ [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>; // high applies to src0 and src2 (p and p0)
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
+ V_INTERP_P2_F16_F32_inreg, f16,
+ [VINTERPModsHi, VINTERPMods, VINTERPMods]>; // high applies to src0 only (p)
+
//===----------------------------------------------------------------------===//
// VINTERP Real Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
new file mode 100644
index 000000000000..b2e1542c1bc0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: lds_param_load v0, attr0.y
+; GCN-NEXT: lds_param_load v1, attr1.x
+; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
+; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done
+; GCN-NEXT: s_endpgm
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+ %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+ %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+ %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+ %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0
+ ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: lds_param_load v0, attr0.x
+; GCN-NEXT: lds_param_load v1, attr1.x
+; GCN-NEXT: lds_param_load v2, attr2.x
+; GCN-NEXT: lds_param_load v3, attr3.x
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7
+; GCN-NEXT: exp mrt0 v0, v1, v2, v3 done
+; GCN-NEXT: s_endpgm
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+ %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+ %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+ %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+ %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+ %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+ %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+ %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+ %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+ %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+ %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+ ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many_vm:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: lds_param_load v2, attr0.x
+; GCN-NEXT: lds_param_load v3, attr1.x
+; GCN-NEXT: lds_param_load v4, attr2.x
+; GCN-NEXT: lds_param_load v5, attr3.x
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GCN-NEXT: exp mrt0 v2, v3, v4, v0 done
+; GCN-NEXT: s_endpgm
+main_body:
+ %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1
+ %i = load float, float addrspace(1)* %i.ptr, align 4
+ %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2
+ %j = load float, float addrspace(1)* %j.ptr, align 4
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+ %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+ %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+ %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+ %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+ %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+ %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+ %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+ %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+ %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+ %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+ ret void
+}
+
+define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f16:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: lds_param_load v0, attr0.x
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
+; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT: v_add_f16_e32 v0, v3, v0
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
+ %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0)
+ %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1)
+ %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1)
+ %res = fadd half %l_p1, %h_p1
+ ret half %res
+}
+
+declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
new file mode 100644
index 000000000000..0e00a67fcf98
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: lds_param_load v0, attr0.y
+; GCN-NEXT: lds_param_load v1, attr1.x
+; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v0, v0, v4, v3 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v1, v1, v4, v0 wait_exp:7
+; GCN-NEXT: exp mrt0 v3, v2, v0, v1 done
+; GCN-NEXT: s_endpgm
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+ %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+ %p1_0 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+ %p0_1 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+ %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_1, float %p1_0, float %p1_1, i1 true, i1 true) #0
+ ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: lds_param_load v0, attr0.x
+; GCN-NEXT: lds_param_load v1, attr1.x
+; GCN-NEXT: lds_param_load v2, attr2.x
+; GCN-NEXT: lds_param_load v3, attr3.x
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v0, v0, v5, v6 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v1, v1, v5, v7 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v2, v2, v5, v8 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v3, v3, v5, v4 wait_exp:7
+; GCN-NEXT: exp mrt0 v0, v1, v2, v3 done
+; GCN-NEXT: s_endpgm
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+ %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+ %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+ %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+ %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+ %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+ %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+ %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+ %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+ %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+ %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+ ret void
+}
+
+define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f32_many_vm:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: lds_param_load v2, attr0.x
+; GCN-NEXT: lds_param_load v3, attr1.x
+; GCN-NEXT: lds_param_load v4, attr2.x
+; GCN-NEXT: lds_param_load v5, attr3.x
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:7
+; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v2, v2, v1, v6 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v3, v3, v1, v7 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
+; GCN-NEXT: exp mrt0 v2, v3, v4, v0 done
+; GCN-NEXT: s_endpgm
+main_body:
+ %i.ptr = getelementptr float, float addrspace(1)* %ptr, i32 1
+ %i = load float, float addrspace(1)* %i.ptr, align 4
+ %j.ptr = getelementptr float, float addrspace(1)* %ptr, i32 2
+ %j = load float, float addrspace(1)* %j.ptr, align 4
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
+ %p2 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 2, i32 %m0)
+ %p3 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 3, i32 %m0)
+ %p0_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p0, float %i, float %p0)
+ %p0_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p0, float %j, float %p0_0)
+ %p1_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p1, float %i, float %p1)
+ %p1_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p1, float %j, float %p1_0)
+ %p2_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p2, float %i, float %p2)
+ %p2_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p2, float %j, float %p2_0)
+ %p3_0 = call float @llvm.amdgcn.interp.inreg.p10(float %p3, float %i, float %p3)
+ %p3_1 = call float @llvm.amdgcn.interp.inreg.p2(float %p3, float %j, float %p3_0)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_1, float %p1_1, float %p2_1, float %p3_1, i1 true, i1 true) #0
+ ret void
+}
+
+define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_f16:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: lds_param_load v0, attr0.x
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_interp_p10_f16_f32 v3, v0, v1, v0 wait_exp:7
+; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT: v_interp_p2_f16_f32 v3, v0, v2, v3 wait_exp:7
+; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v1 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT: v_add_f16_e32 v0, v3, v0
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
+ %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %l_p0, i1 0)
+ %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 1)
+ %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float %p0, float %j, float %h_p0, i1 1)
+ %res = fadd half %l_p1, %h_p1
+ ret half %res
+}
+
+declare float @llvm.amdgcn.lds.param.load(i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
+declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
More information about the llvm-commits
mailing list