[llvm] 813ae28 - [AMDGPU] Detect uniformness of TID / wavefrontsize

Fri Aug 26 23:43:20 PDT 2022

Author: Stanislav Mekhanoshin
Date: 2022-08-26T23:26:08-07:00
New Revision: 813ae2871d71f32cce46768e63185cd64651f6e9

URL: https://github.com/llvm/llvm-project/commit/813ae2871d71f32cce46768e63185cd64651f6e9
DIFF: https://github.com/llvm/llvm-project/commit/813ae2871d71f32cce46768e63185cd64651f6e9.diff

LOG: [AMDGPU] Detect uniformness of TID / wavefrontsize

A value of 'workitemid / wavefrontsize' or 'workitemid & ~(wavefrontsize - 1)'
is wave uniform.

Differential Revision: https://reviews.llvm.org/D132511

Added: 
    llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 147c8850587ec..deded50afd6d3 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,6 +13,7 @@
 
 #include "AMDGPUISelDAGToDAG.h"
 #include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -2965,13 +2966,15 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
   auto Ld = cast<LoadSDNode>(N);
 
+  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
+    return false;
+
   return Ld->getAlign() >= Align(4) &&
-         (((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
-            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
-           !N->isDivergent()) ||
+         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
           (Subtarget->getScalarizeGlobalBehavior() &&
            Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-           Ld->isSimple() && !N->isDivergent() &&
+           Ld->isSimple() &&
            static_cast<const SITargetLowering *>(getTargetLowering())
                ->isMemOpHasNoClobberedMemOperand(N)));
 }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ed53ce149b1b3..6a7880d39db8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -918,6 +918,22 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
     return false;
   }
 
+  using namespace llvm::PatternMatch;
+  uint64_t C;
+  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
+                      m_ConstantInt(C))) ||
+      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
+                      m_ConstantInt(C))))
+    return C >= ST->getWavefrontSizeLog2();
+
+  Value *Mask;
+  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
+                       m_Value(Mask)))) {
+    const DataLayout &DL = cast<Instruction>(V)->getModule()->getDataLayout();
+    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
+           ST->getWavefrontSizeLog2();
+  }
+
   const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
   if (!ExtValue)
     return false;

diff  --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
new file mode 100644
index 0000000000000..36301d8afecfa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -0,0 +1,66 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,W32 --enable-var-scope %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,W64 --enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -S -amdgpu-annotate-uniform < %s | FileCheck --check-prefixes=OPT,OPT-W32 --enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 -S -amdgpu-annotate-uniform < %s | FileCheck --check-prefixes=OPT,OPT-W64 --enable-var-scope %s
+
+; GCN-LABEL: {{^}}lshr_threadid:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @lshr_threadid
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
+define amdgpu_kernel void @lshr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out)  {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = lshr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_threadid:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @ashr_threadid
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
+define amdgpu_kernel void @ashr_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out)  {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %div = ashr i32 %lid, 5
+  %div4 = zext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_threadid:
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @and_threadid
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform !0
+define amdgpu_kernel void @and_threadid(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out)  {
+entry:
+  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %and = and i32 %lid, -32
+  %div4 = zext i32 %and to i64
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4
+  %load = load i32, ptr addrspace(1) %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %div4
+  store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()