[llvm] [AMDGPU] Introduce "amdgpu-uniform-intrinsic-combine" pass to combine uniform AMDGPU lane Intrinsics. (PR #116953)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 05:41:10 PDT 2025
================
@@ -0,0 +1,197 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+/// Simplifies a single AMDGPU lane intrinsic whose relevant operand is
+/// uniform, as reported by UniformityInfo.
+///
+/// \param II the intrinsic call to examine.
+/// \param UI uniformity analysis for the enclosing function.
+/// \return true if the IR was changed; \p II (and possibly some of its
+///         icmp users) may have been erased, so callers must not touch
+///         \p II after a true return.
+static bool optimizeUniformIntrinsic(IntrinsicInst &II,
+ const UniformityInfo *UI) {
+ llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+ switch (IID) {
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ Value *Src = II.getArgOperand(0);
+ // Bail out if the source operand's use is divergent; these lane
+ // intrinsics are only no-ops when the value is uniform across lanes.
+ if (UI->isDivergentUse(II.getOperandUse(0)))
+ return false;
+ // A uniform source makes the intrinsic an identity: every lane already
+ // holds the same value, so forward the operand and drop the call.
+ LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << "\n");
+ II.replaceAllUsesWith(Src);
+ II.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::amdgcn_ballot: {
+ Value *Src = II.getArgOperand(0);
+ if (UI->isDivergentUse(II.getOperandUse(0)))
+ return false;
+ LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << "\n");
+
+ // Only icmp-against-zero users can be simplified below; if there are no
+ // ICmp users at all there is nothing to do, so return early.
+ if (II.user_empty() ||
+ none_of(II.users(), [](User *U) { return isa<ICmpInst>(U); })) {
+ return false;
+ }
+
+ // For a uniform i1 condition, ballot produces either 0 (condition false
+ // in all lanes) or the full active-lane mask (condition true), so a
+ // comparison of the result against zero collapses to the condition (or
+ // its negation). Users are erased while iterating, hence
+ // make_early_inc_range.
+ bool Changed = false;
+ for (User *U : make_early_inc_range(II.users())) {
+ if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+ Value *Op0 = ICmp->getOperand(0);
+ Value *Op1 = ICmp->getOperand(1);
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+
+ // The ballot may appear on either side of the compare; OtherOp is
+ // the operand we match against zero.
+ Value *OtherOp = (Op0 == &II ? Op1 : Op0);
+
+ // (icmp eq %ballot, 0) --> not %ballot_arg
+ // The `not` is inserted immediately before the compare it replaces.
+ if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+ Instruction *NotOp =
+ BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+ LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << "\n");
+ ICmp->replaceAllUsesWith(NotOp);
+ ICmp->eraseFromParent();
+ Changed = true;
+ }
+ // (icmp ne %ballot, 0) --> %ballot_arg
+ else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+ LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
+ << *Src << "\n");
+ ICmp->replaceAllUsesWith(Src);
+ ICmp->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+ // Erase the intrinsic if it has no remaining uses (all of its icmp users
+ // were folded away above).
+ if (II.use_empty())
+ II.eraseFromParent();
+ return Changed;
+ }
+ }
+ // Any other intrinsic is left untouched.
+ return false;
+}
+
+/// Iterates over the uses of the listed intrinsics in the function and
+/// optimizes each one whose arguments are uniform.
+static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo *UI) {
+ Module *M = F.getParent();
+ llvm::LLVMContext &Ctx = M->getContext();
+ // List of AMDGPU intrinsics to optimize if their arguments are uniform.
+ std::vector<Intrinsic::ID> Intrinsics = {
+ Intrinsic::amdgcn_permlane64, Intrinsic::amdgcn_readfirstlane,
+ Intrinsic::amdgcn_readlane, Intrinsic::amdgcn_ballot};
+
+ bool IsChanged = false;
+ // TODO: Vector types can also be optimized, provided generic way to query
+ // getDeclarationIfExists().
+ SmallVector<Type *, 7> Tys = {
+ Type::getInt16Ty(Ctx), // i16
+ Type::getInt32Ty(Ctx), // i32
+ Type::getInt64Ty(Ctx), // i64
+ Type::getHalfTy(Ctx), // Float16
+ Type::getFloatTy(Ctx), // float
+ Type::getDoubleTy(Ctx), // double
+ Type::getBFloatTy(Ctx) // bfloat16
+ };
+ // Iterate over each intrinsic in the list and process its uses within F.
+ for (Intrinsic::ID IID : Intrinsics) {
+ for (Type *Ty : Tys) {
----------------
jayfoad wrote:
It seems silly to iterate over an arbitrary fixed list of types here. Instead you could iterate over every function in the module and call `getIntrinsicID` on it. This should be fast.
https://github.com/llvm/llvm-project/pull/116953
More information about the llvm-commits
mailing list