[llvm] bb86220 - AMDGPU: Don't handle kernarg.segment.ptr in functions
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 12:51:43 PDT 2020
Author: Matt Arsenault
Date: 2020-03-13T12:51:12-07:00
New Revision: bb8622094d77417c629e45fc9964d0b699019f22
URL: https://github.com/llvm/llvm-project/commit/bb8622094d77417c629e45fc9964d0b699019f22
DIFF: https://github.com/llvm/llvm-project/commit/bb8622094d77417c629e45fc9964d0b699019f22.diff
LOG: AMDGPU: Don't handle kernarg.segment.ptr in functions
Just lower this to null. Pass implicitarg.ptr in its place in the
argument list.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index ab55ee45a6b1..8aafcd40e1cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -216,7 +216,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
+ "amdgpu-implicitarg-ptr"};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
@@ -305,11 +305,16 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
} else {
bool NonKernelOnly = false;
- StringRef AttrName = intrinsicToAttrName(IID,
- NonKernelOnly, NeedQueuePtr);
- if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
- F.addFnAttr(AttrName);
- Changed = true;
+
+ if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+ F.addFnAttr("amdgpu-kernarg-segment-ptr");
+ } else {
+ StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
+ NeedQueuePtr);
+ if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+ F.addFnAttr(AttrName);
+ Changed = true;
+ }
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 99dd11c11b66..4bf92904e1ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3747,6 +3747,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
return false;
}
case Intrinsic::amdgcn_kernarg_segment_ptr:
+ if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
+ B.setInstr(MI);
+ // This only makes sense to call in a kernel, so just lower to null.
+ B.buildConstant(MI.getOperand(0).getReg(), 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
return legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
case Intrinsic::amdgcn_implicitarg_ptr:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2cc891a72665..c3efa23795f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1724,8 +1724,10 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasQueuePtr())
ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
- if (Info.hasKernargSegmentPtr())
- ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+ // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
+ // constant offset from the kernarg segment.
+ if (Info.hasImplicitArgPtr())
+ ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
if (Info.hasDispatchID())
ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
@@ -1740,9 +1742,6 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasWorkGroupIDZ())
ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
-
- if (Info.hasImplicitArgPtr())
- ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
@@ -2448,12 +2447,11 @@ void SITargetLowering::passSpecialInputs(
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
AMDGPUFunctionArgInfo::DISPATCH_PTR,
AMDGPUFunctionArgInfo::QUEUE_PTR,
- AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
};
for (auto InputID : InputRegs) {
@@ -5735,6 +5733,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
+ // This only makes sense to call in a kernel, so just lower to null.
+ return DAG.getConstant(0, DL, VT);
+ }
+
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
index 28e4684fface..e3fd488af5e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -114,6 +114,16 @@ define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_r
ret void
}
+; ALL-LABEL: {{^}}func_kernarg_segment_ptr:
+; ALL: s_mov_b32 [[S_LO:s[0-9]+]], 0{{$}}
+; ALL: s_mov_b32 [[S_HI:s[0-9]+]], 0{{$}}
+; ALL: v_mov_b32_e32 v0, [[S_LO]]{{$}}
+; ALL: v_mov_b32_e32 v1, [[S_HI]]{{$}}
+define i8 addrspace(4)* @func_kernarg_segment_ptr() {
+ %ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+ ret i8 addrspace(4)* %ptr
+}
+
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 82ba28526d35..9ae7cc84167b 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -231,7 +231,7 @@ define void @use_kernarg_segment_ptr() #1 {
ret void
}
-; HSA: define void @func_indirect_use_kernarg_segment_ptr() #14 {
+; HSA: define void @func_indirect_use_kernarg_segment_ptr() #11 {
define void @func_indirect_use_kernarg_segment_ptr() #1 {
call void @use_kernarg_segment_ptr()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 94adaed43b23..601ed9698c61 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -66,10 +66,10 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
ret void
}
+; Not really supported in callable functions.
; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
define hidden void @use_kernarg_segment_ptr() #1 {
%kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
%header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
@@ -79,10 +79,6 @@ define hidden void @use_kernarg_segment_ptr() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
-; GCN-NOT: s[4:5]
-; GCN-NOT: s4
-; GCN-NOT: s5
-; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
call void @use_kernarg_segment_ptr()
ret void
@@ -437,9 +433,9 @@ define hidden void @use_every_sgpr_input() #1 {
%queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
%val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
- %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
- %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
- %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
+ %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+ %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+ %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc
%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
@@ -521,9 +517,9 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
%queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
%val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
- %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
- %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
- %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
+ %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+ %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+ %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc
%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
@@ -590,9 +586,9 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill()
%queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
%val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
- %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
- %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
- %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
+ %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+ %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+ %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc
%val3 = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %val3)
@@ -614,6 +610,7 @@ declare i32 @llvm.amdgcn.workgroup.id.y() #0
declare i32 @llvm.amdgcn.workgroup.id.z() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0
declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index 0589da71fd62..851c3aa8ef49 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -165,18 +165,13 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 {
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA-DAG: v_mov_b32_e32 v0, s4
-; MESA-DAG: v_mov_b32_e32 v1, s5
-; MESA-DAG: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; MESA: v_mov_b32_e32 v0, s6
-; MESA: v_mov_b32_e32 v1, s7
+; GCN-DAG: v_mov_b32_e32 v0, s4
+; GCN-DAG: v_mov_b32_e32 v1, s5
+; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
+; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
+
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: v_mov_b32_e32 v0, s4
-; HSA: v_mov_b32_e32 v1, s5
-; HSA: flat_load_dword v0, v[0:1]
-; HSA: v_mov_b32_e32 v0, s6
-; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
; GCN: s_waitcnt vmcnt(0)
@@ -192,20 +187,12 @@ define void @func_kernarg_implicitarg_ptr() #0 {
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA-DAG: v_mov_b32_e32 v0, s4
-; MESA-DAG: v_mov_b32_e32 v1, s5
-; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; MESA-DAG: v_mov_b32_e32 v0, s6
-; MESA-DAG: v_mov_b32_e32 v1, s7
-; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-
+; GCN-DAG: v_mov_b32_e32 v0, s4
+; GCN-DAG: v_mov_b32_e32 v1, s5
+; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
+; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
-; HSA: v_mov_b32_e32 v0, s4
-; HSA: v_mov_b32_e32 v1, s5
-; HSA: flat_load_dword v0, v[0:1]
-
-; HSA: v_mov_b32_e32 v0, s6
-; HSA: v_mov_b32_e32 v1, s7
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: flat_load_dword v0, v[0:1]
; GCN: s_waitcnt vmcnt(0)
@@ -220,8 +207,8 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
}
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
-; GCN: s_add_u32 s6, s4, 0x70
-; GCN: s_addc_u32 s7, s5, 0
+; GCN: s_add_u32 s4, s4, 0x70
+; GCN: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_kernarg_implicitarg_ptr()
More information about the llvm-commits
mailing list