[llvm] [WIP][AMDGPU] Combine uniform AMDGPU lane intrinsics (PR #116953)

Juan Manuel Martinez Caamaño via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 19 05:00:01 PST 2025


================
@@ -0,0 +1,178 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
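+///
+/// For example, a call like
+///   %v = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %x)
+/// is redundant when %x is known to be uniform: every lane already holds the
+/// same value, so the call can be replaced by %x itself.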
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+namespace {
+class AMDGPUUniformIntrinsicCombineLegacy : public FunctionPass {
+public:
+  static char ID;
+  AMDGPUUniformIntrinsicCombineLegacy() : FunctionPass(ID) {
+    initializeAMDGPUUniformIntrinsicCombineLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<UniformityInfoWrapperPass>();
+    AU.addRequired<TargetPassConfig>();
+  }
+};
+
+class AMDGPUUniformIntrinsicCombineImpl
+    : public InstVisitor<AMDGPUUniformIntrinsicCombineImpl> {
+private:
+  const UniformityInfo *UI;
+  bool optimizeUniformIntrinsicInst(IntrinsicInst &II) const;
+
+public:
+  AMDGPUUniformIntrinsicCombineImpl() = delete;
+  AMDGPUUniformIntrinsicCombineImpl(const UniformityInfo *UI) : UI(UI) {}
+  bool run(Function &F);
+};
+} // namespace
+
+char AMDGPUUniformIntrinsicCombineLegacy::ID = 0;
+char &llvm::AMDGPUUniformIntrinsicCombineLegacyPassID =
+    AMDGPUUniformIntrinsicCombineLegacy::ID;
+
+bool AMDGPUUniformIntrinsicCombineLegacy::runOnFunction(Function &F) {
+  if (skipFunction(F)) {
+    return false;
+  }
+  const UniformityInfo *UI =
+      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  return AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+  const auto *UI = &AM.getResult<UniformityInfoAnalysis>(F);
+  bool IsChanged = AMDGPUUniformIntrinsicCombineImpl(UI).run(F);
+
+  if (!IsChanged) {
+    return PreservedAnalyses::all();
+  }
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  PA.preserve<ScalarEvolutionAnalysis>();
+  PA.preserve<UniformityInfoAnalysis>();
+  PA.preserve<TargetLibraryAnalysis>();
+  return PA;
+}
+
+bool AMDGPUUniformIntrinsicCombineImpl::run(Function &F) {
+  bool IsChanged{false};
+  // Walk every instruction in the function, looking for intrinsic calls that
+  // can be simplified.
+  for (Instruction &I : instructions(F)) {
+    if (auto *Intrinsic = dyn_cast<IntrinsicInst>(&I)) {
+      IsChanged |= optimizeUniformIntrinsicInst(*Intrinsic);
+    }
+  }
+  return IsChanged;
+}
+
+bool AMDGPUUniformIntrinsicCombineImpl::optimizeUniformIntrinsicInst(
+    IntrinsicInst &II) const {
+  llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+  switch (IID) {
+  case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readlane: {
+    Value *Src = II.getArgOperand(0);
+    // If the argument is uniform across lanes, the intrinsic is a no-op and
+    // can be folded to its argument.
+    if (!UI->isDivergentUse(II.getOperandUse(0))) {
+      LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
+      II.replaceAllUsesWith(Src);
+      return true;
+    }
+    break;
+  }
+  case Intrinsic::amdgcn_ballot: {
+    Value *Src = II.getArgOperand(0);
+    // Check if the ballot argument is uniform and the result has a direct
+    // `icmp eq` use against zero. If so, compare the i1 argument directly
+    // instead of the ballot mask:
+    //   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
+    //   %is_done = icmp eq i64 %ballot, 0
+    // becomes
+    //   %is_done = icmp eq i1 %cond, 0
+    if (!UI->isDivergentUse(II.getOperandUse(0))) {
+      LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
+
+      // Look for a direct `icmp eq` use of the ballot result.
+      auto It = llvm::find_if(II.users(), [&](User *U) {
+        return match(U, m_ICmp(m_Specific(&II), m_Zero()));
+      });
+
+      // Check if a match was found
+      if (It != II.user_end()) {
+        // Extract the matching `icmp` instruction
+        ICmpInst *ICmp = dyn_cast<ICmpInst>(*It);
----------------
jmmartinez wrote:

In the `find_if` above, we're already matching against `ICmpInst`, right? You could just use `cast<ICmpInst>(*It)` and avoid the "safety check".
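i.e. something like this (just a sketch, keeping the rest of the body as in
the current patch):

  if (It != II.user_end()) {
    // find_if only accepted `icmp` users, so the cast cannot fail.
    ICmpInst *ICmp = cast<ICmpInst>(*It);
    // ... rewrite the compare as before ...
  }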

Shouldn't we iterate over all the users that match the condition? I guess duplicates would be very unusual, since GVN would have removed them, but still.
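Untested sketch of what I mean (assuming the rewrite of each compare is moved
into the loop body):

  bool Changed = false;
  // make_early_inc_range so replacing a user doesn't invalidate the iterator.
  for (User *U : make_early_inc_range(II.users())) {
    if (!match(U, m_ICmp(m_Specific(&II), m_Zero())))
      continue;
    ICmpInst *ICmp = cast<ICmpInst>(U);
    // ... rewrite this compare against the ballot argument ...
    Changed = true;
  }
  return Changed;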

https://github.com/llvm/llvm-project/pull/116953

