[llvm] r338618 - AMDGPU: Partially fix handling of packed amdgpu_ps arguments
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 1 12:57:34 PDT 2018
Author: arsenm
Date: Wed Aug 1 12:57:34 2018
New Revision: 338618
URL: http://llvm.org/viewvc/llvm-project?rev=338618&view=rev
Log:
AMDGPU: Partially fix handling of packed amdgpu_ps arguments
Packed 16-bit vector arguments (v2i16, v2f16) are now assigned to registers directly, which fixes annoying limitations when writing tests.
Also remove more leftover code for manually scalarizing arguments
and return values.
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td?rev=338618&r1=338617&r2=338618&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUCallingConv.td Wed Aug 1 12:57:34 2018
@@ -19,7 +19,7 @@ class CCIfExtend<CCAction A>
// Calling convention for SI
def CC_SI : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -33,7 +33,7 @@ def CC_SI : CallingConv<[
CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -64,7 +64,7 @@ def RetCC_SI_Shader : CallingConv<[
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32, f16] , CCAssignToReg<[
+ CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=338618&r1=338617&r2=338618&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Aug 1 12:57:34 2018
@@ -1349,7 +1349,8 @@ static void processShaderInputArgs(Small
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
- assert(!Arg->VT.isVector() && "vector type argument should have been split");
+ assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
+ "vector type argument should have been split");
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
@@ -1951,29 +1952,6 @@ SDValue SITargetLowering::LowerFormalArg
llvm_unreachable("Unknown loc info!");
}
- if (IsShader && Arg.VT.isVector()) {
- // Build a vector from the registers
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- SmallVector<SDValue, 4> Regs;
- Regs.push_back(Val);
- for (unsigned j = 1; j != NumElements; ++j) {
- Reg = ArgLocs[ArgIdx++].getLocReg();
- Reg = MF.addLiveIn(Reg, RC);
-
- SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- Regs.push_back(Copy);
- }
-
- // Fill up the missing vector elements
- NumElements = Arg.VT.getVectorNumElements() - NumElements;
- Regs.append(NumElements, DAG.getUNDEF(VT));
-
- InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
- continue;
- }
-
InVals.push_back(Val);
}
@@ -2037,48 +2015,19 @@ SITargetLowering::LowerReturn(SDValue Ch
bool IsShader = AMDGPU::isShader(CallConv);
- Info->setIfReturnsVoid(Outs.size() == 0);
+ Info->setIfReturnsVoid(Outs.empty());
bool IsWaveEnd = Info->returnsVoid() && IsShader;
- SmallVector<ISD::OutputArg, 48> Splits;
- SmallVector<SDValue, 48> SplitVals;
-
- // Split vectors into their elements.
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
- const ISD::OutputArg &Out = Outs[i];
-
- if (IsShader && Out.VT.isVector()) {
- MVT VT = Out.VT.getVectorElementType();
- ISD::OutputArg NewOut = Out;
- NewOut.Flags.setSplit();
- NewOut.VT = VT;
-
- // We want the original number of vector elements here, e.g.
- // three or five, not four or eight.
- unsigned NumElements = Out.ArgVT.getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
- DAG.getConstant(j, DL, MVT::i32));
- SplitVals.push_back(Elem);
- Splits.push_back(NewOut);
- NewOut.PartOffset += NewOut.VT.getStoreSize();
- }
- } else {
- SplitVals.push_back(OutVals[i]);
- Splits.push_back(Out);
- }
- }
-
// CCValAssign - represent the assignment of the return value to a location.
SmallVector<CCValAssign, 48> RVLocs;
+ SmallVector<ISD::OutputArg, 48> Splits;
// CCState - Info about the registers and stack slots.
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
// Analyze outgoing return values.
- CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
@@ -2103,14 +2052,12 @@ SITargetLowering::LowerReturn(SDValue Ch
}
// Copy the result values into the output registers.
- for (unsigned i = 0, realRVLocIdx = 0;
- i != RVLocs.size();
- ++i, ++realRVLocIdx) {
- CCValAssign &VA = RVLocs[i];
+ for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
+ ++I, ++RealRVLocIdx) {
+ CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
// TODO: Partially return in registers if return values don't fit.
-
- SDValue Arg = SplitVals[realRVLocIdx];
+ SDValue Arg = OutVals[RealRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {
Modified: llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll?rev=338618&r1=338617&r2=338618&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/calling-conventions.ll Wed Aug 1 12:57:34 2018
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; Make sure we don't crash or assert on spir_kernel calling convention.
@@ -88,8 +88,8 @@ define amdgpu_cs half @cs_mesa(half %arg
; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config
; GCN-LABEL: .AMDGPU.config
; GCN: .long 45096
-; GCN-LABEL: {{^}}ps_mesa:
-define amdgpu_ps half @ps_mesa(half %arg0) {
+; GCN-LABEL: {{^}}ps_mesa_f16:
+define amdgpu_ps half @ps_mesa_f16(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -121,4 +121,83 @@ define amdgpu_hs half @hs_mesa(half %arg
ret half %add
}
+; FIXME: Inconsistent ABI between targets
+; GCN-LABEL: {{^}}ps_mesa_v2f16:
+; VI: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: ; return
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
+; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
+; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
+; SI: ; return to shader part epilog
+define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
+ %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
+ ret <2 x half> %add
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16:
+; VI: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v1, s0, 1.0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: ; return to shader part epilog
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
+; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
+; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
+; SI: ; return to shader part epilog
+define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
+ %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
+ ret <2 x half> %add
+}
+
+; GCN-LABEL: {{^}}ps_mesa_v2i16:
+; VI: v_mov_b32_e32 v2, 1
+; VI: v_add_u16_e32 v1, 1, v0
+; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_or_b32_e32 v0, v1, v0
+
+
+; SI: v_lshlrev_b32_e32 v1, 16, v1
+; SI: v_add_i32_e32 v0, vcc, 1, v0
+; SI: v_add_i32_e32 v1, vcc, 0x10000, v1
+; SI: v_and_b32
+; SI: v_or_b32
+define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
+ %add = add <2 x i16> %arg0, <i16 1, i16 1>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
+; VI: s_lshr_b32 s1, s0, 16
+; VI: s_add_i32 s1, s1, 1
+; VI: s_add_i32 s0, s0, 1
+; VI: s_and_b32 s0, s0, 0xffff
+; VI: s_lshl_b32 s1, s1, 16
+; VI: s_or_b32 s0, s0, s1
+; VI: v_mov_b32_e32 v0, s0
+
+; SI: s_lshl_b32 s1, s1, 16
+; SI: s_add_i32 s0, s0, 1
+; SI: s_add_i32 s1, s1, 0x10000
+; SI: s_and_b32 s0, s0, 0xffff
+; SI: s_or_b32 s0, s0, s1
+define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
+ %add = add <2 x i16> %arg0, <i16 1, i16 1>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
attributes #0 = { nounwind noinline }
More information about the llvm-commits mailing list