[llvm] 813ae28 - [AMDGPU] Detect uniformness of TID / wavefrontsize
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 26 23:43:20 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-08-26T23:26:08-07:00
New Revision: 813ae2871d71f32cce46768e63185cd64651f6e9
URL: https://github.com/llvm/llvm-project/commit/813ae2871d71f32cce46768e63185cd64651f6e9
DIFF: https://github.com/llvm/llvm-project/commit/813ae2871d71f32cce46768e63185cd64651f6e9.diff
LOG: [AMDGPU] Detect uniformness of TID / wavefrontsize
A value of 'workitemid / wavefrontsize' or 'workitemid & ~(wavefrontsize - 1)'
is wave uniform.
Differential Revision: https://reviews.llvm.org/D132511
Added:
llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 147c8850587ec..deded50afd6d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -2965,13 +2966,15 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
auto Ld = cast<LoadSDNode>(N);
+ if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
+ return false;
+
return Ld->getAlign() >= Align(4) &&
- (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
- !N->isDivergent()) ||
+ ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- Ld->isSimple() && !N->isDivergent() &&
+ Ld->isSimple() &&
static_cast<const SITargetLowering *>(getTargetLowering())
->isMemOpHasNoClobberedMemOperand(N)));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ed53ce149b1b3..6a7880d39db8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -918,6 +918,22 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
+ using namespace llvm::PatternMatch;
+ uint64_t C;
+ if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
+ m_ConstantInt(C))) ||
+ match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
+ m_ConstantInt(C))))
+ return C >= ST->getWavefrontSizeLog2();
+
+ Value *Mask;
+ if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
+ m_Value(Mask)))) {
+ const DataLayout &DL = cast<Instruction>(V)->getModule()->getDataLayout();
+ return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
+ ST->getWavefrontSizeLog2();
+ }
+
const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
if (!ExtValue)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
new file mode 100644
index 0000000000000..36301d8afecfa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,W32 --enable-var-scope %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,W64 --enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -S -amdgpu-annotate-uniform < %s | FileCheck --check-prefixes=OPT,OPT-W32 --enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -S -amdgpu-annotate-uniform < %s | FileCheck --check-prefixes=OPT,OPT-W64 --enable-var-scope %s
+
+; GCN-LABEL: {{^}}lshr_threadid:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @lshr_threadid
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %div = lshr i32 %lid, 5
+ %div4 = zext i32 %div to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}ashr_threadid:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @ashr_threadid
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %div = ashr i32 %lid, 5
+ %div4 = zext i32 %div to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}and_threadid:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @and_threadid
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) {
+entry:
+ %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %and = and i32 %lid, -32
+ %div4 = zext i32 %and to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list