[llvm] 85117e2 - AMDGPU: Fix not using scalar loads for global reads in shaders
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 2 06:49:29 PDT 2020
Author: Matt Arsenault
Date: 2020-06-02T09:49:23-04:00
New Revision: 85117e286db0bfa6c7cecadd2c0c348e3358f450
URL: https://github.com/llvm/llvm-project/commit/85117e286db0bfa6c7cecadd2c0c348e3358f450
DIFF: https://github.com/llvm/llvm-project/commit/85117e286db0bfa6c7cecadd2c0c348e3358f450.diff
LOG: AMDGPU: Fix not using scalar loads for global reads in shaders
The pass which infers when it's legal to perform a global address space
load as an SMRD (scalar) load was only considering amdgpu_kernel, and
ignoring the shader entry point calling conventions.
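
The change itself is a single condition in AMDGPUAnnotateUniformValues;
the sketch below is paraphrased from the diff rather than quoted verbatim
from the surrounding code:

    // Before: only compute kernels were treated as entry points, so uniform
    // global loads in amdgpu_ps / amdgpu_cs shaders were left as vector loads.
    bool isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;

    // After: any hardware entry point (kernels plus the graphics/compute
    // shader calling conventions) qualifies for the scalar-load annotation.
    bool isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());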
Added:
llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
llvm/test/CodeGen/AMDGPU/global-saddr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 6fb507083cef..b09e92c07f9b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -35,7 +36,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
MemoryDependenceResults *MDR;
LoopInfo *LI;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
- bool isKernelFunc;
+ bool isEntryFunc;
public:
static char ID;
@@ -127,11 +128,10 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
auto isGlobalLoad = [&](LoadInst &Load)->bool {
return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
};
- // We're tracking up to the Function boundaries
- // We cannot go beyond because of FunctionPass restrictions
- // Thus we can ensure that memory not clobbered for memory
- // operations that live in kernel only.
- bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+ // We're tracking up to the Function boundaries, and cannot go beyond because
+ // of FunctionPass restrictions. Thus we can only ensure that memory is not
+ // clobbered for memory operations that live in entry point functions.
+ bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I);
Instruction *PtrI = dyn_cast<Instruction>(Ptr);
if (!PtrI && NotClobbered && isGlobalLoad(I)) {
if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
@@ -170,7 +170,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
DA = &getAnalysis<LegacyDivergenceAnalysis>();
MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);
noClobberClones.clear();
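
For context, AMDGPU::isEntryFunctionCC (from Utils/AMDGPUBaseInfo.h, now
included above) is essentially a switch over the hardware entry point
calling conventions. The sketch below approximates its shape; treat the
exact set of conventions as an assumption rather than a quote of
AMDGPUBaseInfo.cpp at this revision:

    // Rough sketch: returns true for calling conventions that correspond to
    // hardware entry points (compute kernels and graphics shader stages).
    static bool isEntryFunctionCC(CallingConv::ID CC) {
      switch (CC) {
      case CallingConv::AMDGPU_KERNEL:
      case CallingConv::SPIR_KERNEL:
      case CallingConv::AMDGPU_VS:
      case CallingConv::AMDGPU_HS:
      case CallingConv::AMDGPU_GS:
      case CallingConv::AMDGPU_PS:
      case CallingConv::AMDGPU_CS:
      case CallingConv::AMDGPU_ES:
      case CallingConv::AMDGPU_LS:
        return true;
      default:
        return false;
      }
    }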
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index e7a77797e165..adf9fdd2fe81 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -481,7 +481,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) {
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
- %val = load float, float addrspace(1)* %ptr
+ %val = load volatile float, float addrspace(1)* %ptr
ret float %val
}
@@ -508,7 +508,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4095
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -541,7 +541,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)*
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -574,7 +574,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)*
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -601,7 +601,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4096
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -626,7 +626,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr)
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4095
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -651,7 +651,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)*
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -676,7 +676,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)*
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -701,7 +701,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr)
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4096
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -734,7 +734,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -759,7 +759,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -785,7 +785,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
%gep1 = getelementptr float, float addrspace(1)* %gep0, i32 256
- %val = load float, float addrspace(1)* %gep1
+ %val = load volatile float, float addrspace(1)* %gep1
ret float %val
}
@@ -823,7 +823,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256
%gep1 = getelementptr float, float addrspace(1)* %gep0, i32 %soffset
- %val = load float, float addrspace(1)* %gep1
+ %val = load volatile float, float addrspace(1)* %gep1
ret float %val
}
@@ -852,7 +852,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inre
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset
- %val = load float, float addrspace(1)* %gep
+ %val = load volatile float, float addrspace(1)* %gep
ret float %val
}
@@ -884,7 +884,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspa
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset
%gep1 = getelementptr float, float addrspace(1)* %gep0, i64 4095
- %val = load float, float addrspace(1)* %gep1
+ %val = load volatile float, float addrspace(1)* %gep1
ret float %val
}
define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspace(1)* inreg %ptr, i32 %voffset) {
@@ -913,7 +913,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095
%gep1 = getelementptr float, float addrspace(1)* %gep0, i32 %voffset
- %val = load float, float addrspace(1)* %gep1
+ %val = load volatile float, float addrspace(1)* %gep1
ret float %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll
index 4897af7d506f..136cfd63686c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll
@@ -84,6 +84,7 @@ entry:
ret void
}
+; GFX9-LABEL: {{^}}_amdgpu_cs_main:
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
; GFX9-NEXT: s_waitcnt
@@ -92,7 +93,7 @@ entry:
define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
bb:
%tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
- %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
+ %tmp2 = load volatile <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll b/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
new file mode 100644
index 000000000000..e44feac835a1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+;
+; Make sure shaders with uniform, unmodified global address space
+; loads are accessed with scalar loads.
+
+define amdgpu_ps i32 @ps_load_uniform_global_i32_align4(i32 addrspace(1)* inreg %ptr) {
+; GCN-LABEL: ps_load_uniform_global_i32_align4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ ret i32 %load
+}
+
+define amdgpu_cs i32 @cs_load_uniform_global_i32_align4(i32 addrspace(1)* inreg %ptr) {
+; GCN-LABEL: cs_load_uniform_global_i32_align4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ ret i32 %load
+}