[llvm] 290e572 - [AMDGPU] Improve clobbering checks in the kernel argument promotion

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 10 14:52:02 PST 2022


Author: Stanislav Mekhanoshin
Date: 2022-02-10T14:51:47-08:00
New Revision: 290e5722e83e9c7480d64c049a14b74e30b6af4a

URL: https://github.com/llvm/llvm-project/commit/290e5722e83e9c7480d64c049a14b74e30b6af4a
DIFF: https://github.com/llvm/llvm-project/commit/290e5722e83e9c7480d64c049a14b74e30b6af4a.diff

LOG: [AMDGPU] Improve clobbering checks in the kernel argument promotion

Use the same MSSA clobbering checks as in AMDGPUAnnotateUniformValues.
Kernel argument promotion needs exactly the same information, so factor
out the utility function isClobberedInFunction.

Differential Revision: https://reviews.llvm.org/D119480
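
For context, here is a minimal sketch of how a pass consults the
factored-out helper. It is illustrative only and not part of the commit;
LI, MSSA and AA are assumed to be the load under inspection and the
pass's MemorySSA/AliasAnalysis results, with signatures matching the new
header in the diff below:

    #include "Utils/AMDGPUMemoryUtils.h"

    // Annotate a global load that cannot be clobbered within its function,
    // mirroring what AMDGPUAnnotateUniformValues::visitLoadInst now does.
    static void annotateIfNotClobbered(LoadInst &LI, MemorySSA *MSSA,
                                       AAResults *AA) {
      if (LI.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
          !AMDGPU::isClobberedInFunction(&LI, MSSA, AA))
        LI.setMetadata("amdgpu.noclobber", MDNode::get(LI.getContext(), {}));
    }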

Added: 
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
    llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
    llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
    llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index dbbc478291b5d..440c527addff9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -14,12 +14,11 @@
 
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-annotate-uniform"
@@ -53,7 +52,6 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
 
   void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
-  bool isClobberedInFunction(LoadInst * Load);
 };
 
 } // End anonymous namespace
@@ -75,81 +73,6 @@ static void setNoClobberMetadata(Instruction *I) {
   I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
 }
 
-bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
-  MemorySSAWalker *Walker = MSSA->getWalker();
-  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
-  SmallSet<MemoryAccess *, 8> Visited;
-  MemoryLocation Loc(MemoryLocation::get(Load));
-
-  const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
-    Instruction *DefInst = Def->getMemoryInst();
-    LLVM_DEBUG(dbgs() << "  Def: " << *DefInst << '\n');
-
-    if (isa<FenceInst>(DefInst))
-      return false;
-
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::amdgcn_s_barrier:
-      case Intrinsic::amdgcn_wave_barrier:
-        return false;
-      default:
-        break;
-      }
-    }
-
-    // Ignore atomics not aliasing with the original load, any atomic is a
-    // universal MemoryDef from MSSA's point of view too, just like a fence.
-    const auto checkNoAlias = [this, Load](auto I) -> bool {
-      return I && AA->isNoAlias(I->getPointerOperand(),
-                                Load->getPointerOperand());
-    };
-
-    if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
-        checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
-      return false;
-
-    return true;
-  };
-
-  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
-
-  // Start with a nearest dominating clobbering access, it will be either
-  // live on entry (nothing to do, load is not clobbered), MemoryDef, or
-  // MemoryPhi if several MemoryDefs can define this memory state. In that
-  // case add all Defs to WorkList and continue going up and checking all
-  // the definitions of this memory location until the root. When all the
-  // defs are exhausted and came to the entry state we have no clobber.
-  // Along the scan ignore barriers and fences which are considered clobbers
-  // by the MemorySSA, but not really writing anything into the memory.
-  while (!WorkList.empty()) {
-    MemoryAccess *MA = WorkList.pop_back_val();
-    if (!Visited.insert(MA).second)
-      continue;
-
-    if (MSSA->isLiveOnEntryDef(MA))
-      continue;
-
-    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
-      if (isReallyAClobber(Def)) {
-        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
-        return true;
-      }
-
-      WorkList.push_back(
-          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
-      continue;
-    }
-
-    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
-    for (auto &Use : Phi->incoming_values())
-      WorkList.push_back(cast<MemoryAccess>(&Use));
-  }
-
-  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
-  return false;
-}
-
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
   if (DA->isUniform(&I))
     setUniformMetadata(&I);
@@ -169,8 +92,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   if (!isEntryFunc)
     return;
   bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-  bool NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
-  if (NotClobbered)
+  if (GlobalLoad && !AMDGPU::isClobberedInFunction(&I, MSSA, AA))
     setNoClobberMetadata(&I);
 }
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
index 01d03d17ec47e..b9b48290dd277 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -16,7 +16,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
@@ -30,6 +32,8 @@ namespace {
 class AMDGPUPromoteKernelArguments : public FunctionPass {
   MemorySSA *MSSA;
 
+  AliasAnalysis *AA;
+
   Instruction *ArgCastInsertPt;
 
   SmallVector<Value *> Ptrs;
@@ -43,11 +47,12 @@ class AMDGPUPromoteKernelArguments : public FunctionPass {
 
   AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
 
-  bool run(Function &F, MemorySSA &MSSA);
+  bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);
 
   bool runOnFunction(Function &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<MemorySSAWrapperPass>();
     AU.setPreservesAll();
   }
@@ -75,9 +80,8 @@ void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
            PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
           LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
         break;
-      const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
       // TODO: This load probably can be promoted to constant address space.
-      if (MSSA->isLiveOnEntryDef(MA))
+      if (!AMDGPU::isClobberedInFunction(LD, MSSA, AA))
         Ptrs.push_back(LD);
       break;
     }
@@ -131,7 +135,8 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
   return InsPt;
 }
 
-bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
+                                       AliasAnalysis &AA) {
   if (skipFunction(F))
     return false;
 
@@ -141,6 +146,7 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
 
   ArgCastInsertPt = &*getInsertPt(*F.begin());
   this->MSSA = &MSSA;
+  this->AA = &AA;
 
   for (Argument &Arg : F.args()) {
     if (Arg.use_empty())
@@ -166,11 +172,13 @@ bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
 
 bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
   MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
-  return run(F, MSSA);
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  return run(F, MSSA, AA);
 }
 
 INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                       "AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
                     "AMDGPU Promote Kernel Arguments", false, false)
@@ -185,7 +193,8 @@ PreservedAnalyses
 AMDGPUPromoteKernelArgumentsPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
-  if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
     PA.preserve<MemorySSAAnalysis>();

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
new file mode 100644
index 0000000000000..d3848c3cb4876
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -0,0 +1,104 @@
+//===-- AMDGPUMemoryUtils.cpp - Memory related helper functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMemoryUtils.h"
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicInst.h"
+
+#define DEBUG_TYPE "amdgpu-memory-utils"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace AMDGPU {
+
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
+  Instruction *DefInst = Def->getMemoryInst();
+
+  if (isa<FenceInst>(DefInst))
+    return false;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_s_barrier:
+    case Intrinsic::amdgcn_wave_barrier:
+      return false;
+    default:
+      break;
+    }
+  }
+
+  // Ignore atomics not aliasing with the original load; any atomic is a
+  // universal MemoryDef from MSSA's point of view, just like a fence.
+  const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
+    return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
+  };
+
+  if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+      checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+    return false;
+
+  return true;
+}
+
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA) {
+  MemorySSAWalker *Walker = MSSA->getWalker();
+  SmallVector<MemoryAccess *> WorkList{Walker->getClobberingMemoryAccess(Load)};
+  SmallSet<MemoryAccess *, 8> Visited;
+  MemoryLocation Loc(MemoryLocation::get(Load));
+
+  LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
+
+  // Start with the nearest dominating clobbering access; it will be either
+  // live on entry (nothing to do, the load is not clobbered), a MemoryDef, or
+  // a MemoryPhi if several MemoryDefs can define this memory state. In that
+  // case add all the Defs to the WorkList and continue walking up, checking
+  // all the definitions of this memory location until the root. If all the
+  // defs are exhausted and we reach the entry state, there is no clobber.
+  // Along the scan, ignore barriers and fences, which MemorySSA considers
+  // clobbers even though they do not actually write any memory.
+  while (!WorkList.empty()) {
+    MemoryAccess *MA = WorkList.pop_back_val();
+    if (!Visited.insert(MA).second)
+      continue;
+
+    if (MSSA->isLiveOnEntryDef(MA))
+      continue;
+
+    if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
+      LLVM_DEBUG(dbgs() << "  Def: " << *Def->getMemoryInst() << '\n');
+
+      if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
+        LLVM_DEBUG(dbgs() << "      -> load is clobbered\n");
+        return true;
+      }
+
+      WorkList.push_back(
+          Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
+      continue;
+    }
+
+    const MemoryPhi *Phi = cast<MemoryPhi>(MA);
+    for (auto &Use : Phi->incoming_values())
+      WorkList.push_back(cast<MemoryAccess>(&Use));
+  }
+
+  LLVM_DEBUG(dbgs() << "      -> no clobber\n");
+  return false;
+}
+
+} // end namespace AMDGPU
+
+} // end namespace llvm

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
new file mode 100644
index 0000000000000..97fcbfc8347da
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -0,0 +1,35 @@
+//===- AMDGPUMemoryUtils.h - Memory related helper functions -*- C++ -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
+
+namespace llvm {
+
+class AAResults;
+class LoadInst;
+class MemoryDef;
+class MemorySSA;
+class Value;
+
+namespace AMDGPU {
+
+/// Given a \p Def clobbering a load from \p Ptr according to MemorySSA, check
+/// if this is actually a memory update or an artificial clobber to facilitate
+/// ordering constraints.
+bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA);
+
+/// Check if a \p Load is clobbered in its function.
+bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA,
+                           AAResults *AA);
+
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H

diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index a9f9d0e7209b9..5db3da99e18de 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
   AMDGPUAsmUtils.cpp
   AMDGPUBaseInfo.cpp
   AMDGPULDSUtils.cpp
+  AMDGPUMemoryUtils.cpp
   AMDGPUPALMetadata.cpp
   AMDKernelCodeTUtils.cpp
 

diff --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
index b7eb47aeaee4b..82ca6a8b3f644 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -314,4 +314,31 @@ entry:
   ret void
 }
 
+; GCN-LABEL: ptr_nest_3_barrier:
+; GCN-COUNT-2: global_load_dwordx2
+; GCN:         global_store_dword
+define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture readonly %Arg) {
+; CHECK-LABEL: @ptr_nest_3_barrier(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
+; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
+; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
+  tail call void @llvm.amdgcn.s.barrier()
+  %p2 = load float**, float** addrspace(1)* %p1, align 8
+  %p3 = load float*, float** %p2, align 8
+  store float 0.000000e+00, float* %p3, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
+declare void @llvm.amdgcn.s.barrier()


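The new ptr_nest_3_barrier test above exercises exactly this change: the
s_barrier between the kernel-argument loads is no longer treated as a
clobber, so both pointer loads still get promoted. A sketch of checking
the IR transform locally (assuming the pass is exposed to opt under the
name amdgpu-promote-kernel-arguments; the test's actual RUN lines are
authoritative):

    opt -S -mtriple=amdgcn-amd-amdhsa \
        -passes=amdgpu-promote-kernel-arguments \
        llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll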