[llvm] [AMDGPU] New image intrinsic optimizer pass (PR #67151)
Juan Manuel Martinez CaamaƱo via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 25 05:00:25 PDT 2023
================
@@ -0,0 +1,336 @@
+//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
+// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
+//
+// - they refer to the same vaddr except for sample_id,
+// - they use a constant sample_id and they fall into the same group,
+// - they have the same dmask and the number of intrinsics and the number of
+// vaddr/vdata dword transfers is reduced by the combine.
+//
+// Examples of the tradeoff (all assume 2DMsaa for vaddr):
+//
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
+// | (dmask) | | | | vdata | | vdata | |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+//
+// Some cases are of questionable benefit, like the one marked with "yes?"
+// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
+// and TX, but higher vdata. We start by erring on the side of converting these
+// to MSAA_LOAD.
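+//
+// For example, in the "yes?" row the two original loads move 6 vaddr + 2 vdata
+// = 8 dwords, while the single msaa_load moves 3 vaddr + 4 vdata = 7 dwords:
+// fewer dwords in total, but more vdata.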
+//
+// This pass will combine intrinsics such as (not necessarily consecutive):
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+// ==>
+// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
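+// followed by extractelement instructions (at index FragId % 4) that replace
+// each original load's result.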
+//
+// Future improvements:
+//
+// - We may occasionally not want to do the combine if it increases the maximum
+// register pressure.
+//
+// - Ensure clausing when multiple MSAA_LOAD are generated.
+//
+// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
+// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
+// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
+// we don't know the format at compile time.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
+
+namespace {
+class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID;
+
+ AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+
+ bool runOnFunction(Function &F) override;
+
+}; // End of class AMDGPUImageIntrinsicOptimizer
+} // End anonymous namespace
+
+INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
+ "AMDGPU Image Intrinsic Optimizer", false, false)
+
+char AMDGPUImageIntrinsicOptimizer::ID = 0;
+
+void addInstToMergeableList(
+ IntrinsicInst *II, std::list<std::list<IntrinsicInst *>> &MergeableInsts,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
+ for (std::list<IntrinsicInst *> &IIList : MergeableInsts) {
+ // Check Dim.
+ if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
+ continue;
+
+ // Check D16.
+ if (IIList.front()->getType() != II->getType())
+ continue;
+
+ // Check DMask.
+ Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
+ Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
+ if (DMaskList != DMask)
+ continue;
+
+ // Check VAddr (except FragId).
+ int I = ImageDimIntr->VAddrStart;
+ for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
+ if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
+ break;
+ }
+
+ if (I != ImageDimIntr->VAddrEnd - 1)
+ continue;
+
+    // Check FragId group: only loads whose FragIds fall in the same group of
+    // four samples can be covered by a single msaa_load.
+ const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+ Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
+ auto IIListFragId = cast<ConstantInt>(FragIdList);
+ auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+ if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
+ continue;
+
+ // Add to the list.
+ IIList.emplace_back(II);
+ return;
+ }
+
+ // Similar instruction not found, so add a new list.
+ MergeableInsts.emplace_back(1, II);
+ LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
+}
+
+// Collect a list of all instructions we know how to merge in a subset of the
+// block. Returns an iterator to the instruction after the last one analyzed.
+BasicBlock::iterator
+collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E,
+ std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+ for (; I != E; ++I) {
+ // Don't combine if there is a store in the middle or if there is a memory
+ // barrier.
+ if (I->mayHaveSideEffects()) {
+ ++I;
+ break;
+ }
+
+ // Ignore non-intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ Intrinsic::ID IntrinID = II->getIntrinsicID();
+
+ // Ignore other intrinsics.
+ if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
+ IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
+ continue;
+
+ // Check for constant FragId.
+ const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+ const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+ if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
+ addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
+ }
+ }
+
+ return I;
+}
+
+bool optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+ bool Modified = false;
+
+ SmallVector<Instruction *, 4> InstrsToErase;
+ for (auto IIList : MergeableInsts) {
+ if (IIList.size() <= 1)
+ continue;
+
+ // Assume the arguments are unchanged and later override them, if needed.
+ SmallVector<Value *, 16> Args(IIList.front()->args());
+
+ // Validate function argument and return types, extracting overloaded
+ // types along the way.
+ SmallVector<Type *, 6> OverloadTys;
+ Function *F = IIList.front()->getCalledFunction();
+ if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
+ continue;
+
+ Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+
+ Type *EltTy = IIList.front()->getType()->getScalarType();
+ Type *NewTy = FixedVectorType::get(EltTy, 4);
+ OverloadTys[0] = NewTy;
+ bool isD16 = EltTy->isHalfTy();
+
+ ConstantInt *DMask = cast<ConstantInt>(
+ IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+ unsigned NumElts = popcount(DMaskVal);
+
+ // Number of instructions and the number of vaddr/vdata dword transfers
+ // should be reduced.
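+    // Each original load is counted as 3 vaddr dwords plus popcount(dmask)
+    // vdata dwords (two components per dword with d16); each per-channel
+    // msaa_load is counted as 3 vaddr dwords plus a 4-sample vdata return
+    // (2 dwords with d16).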
+ unsigned NumLoads = IIList.size();
+ unsigned NumMsaas = NumElts;
+ unsigned NumVAddrLoads = 3 * NumLoads;
+ unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
+ unsigned NumVAddrMsaas = 3 * NumMsaas;
+ unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
+
+ if (NumLoads < NumMsaas ||
+ (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
+ continue;
+
+ const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+ auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
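+    // Round FragId down to the first sample of its group of four; the new
+    // msaa_load fetches that sample and the next three.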
+ const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
+
+ // Create the new instructions.
+ IRBuilder<> B(IIList.front());
+
+    // Create one image_msaa_load per set dmask bit; each call returns all
+    // four samples of that single channel.
+ SmallVector<Instruction *, 4> NewCalls;
+ while (DMaskVal != 0) {
+ unsigned NewMaskVal = 1 << countr_zero(DMaskVal);
+
+ Intrinsic::ID NewIntrinID;
+ if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
+ NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
+ else
+ NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
+
+ Function *NewIntrin = Intrinsic::getDeclaration(
+ IIList.front()->getModule(), NewIntrinID, OverloadTys);
+ Args[ImageDimIntr->DMaskIndex] =
+ ConstantInt::get(DMask->getType(), NewMaskVal);
+ Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
+ CallInst *NewCall = B.CreateCall(NewIntrin, Args);
+ LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
+
+ NewCalls.push_back(NewCall);
+ DMaskVal -= NewMaskVal;
+ }
+
+    // Rebuild each original load's result from the new msaa_load results,
+    // extracting the element at index FragId % 4 from each per-channel call.
+ for (auto &II : IIList) {
+ Value *VecOp = UndefValue::get(II->getType());
+ auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+ if (NumElts == 1) {
+ VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
+ LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+ } else {
+ for (unsigned I = 0; I < NumElts; ++I) {
+ VecOp = B.CreateInsertElement(
+ VecOp, B.CreateExtractElement(
+ NewCalls[I], Idx->getValue().urem(4)), I);
+ LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+ }
+ }
+
+ // Replace the old instruction.
+ II->replaceAllUsesWith(VecOp);
+ InstrsToErase.push_back(II);
+ }
+
+ Modified = true;
+ }
+
+ for (auto I : InstrsToErase) {
+ I->eraseFromParent();
+ }
+
+ MergeableInsts.clear();
+
+ return Modified;
+}
+
+static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
+ if (!TM)
+ return false;
+
+ // This optimization only applies to GFX11 and beyond.
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
+ return false;
+
+ Module *M = F.getParent();
+
+ // Early test to determine if the intrinsics are used.
+ if (std::none_of(M->begin(), M->end(), [](Function &F) {
+ return !F.users().empty() &&
+ (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
+ F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
+ }))
+ return false;
+
+ bool Modified = false;
+ for (auto &BB : F) {
+ BasicBlock::iterator SectionEnd;
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
+ I = SectionEnd) {
+ std::list<std::list<IntrinsicInst *>> MergeableInsts;
----------------
jmmartinez wrote:
I guess it depends on the workload. If the vector often grows past the inline stack space allocated by a `SmallVector<T>`, I would use the `, 0` version.
In this case, I think it will be fine either way, with or without the `, 0`.
Just to keep in mind, on x86_64:
* a `SmallVector<void*>` preallocates stack space for 6 elements
* a `SmallVector<SmallVector<void*>>` preallocates stack space for a single element
* a `SmallVector<SmallVector<void*, 0>>` preallocates stack space for 3 elements
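Not something for this patch, but if you want to double-check those numbers locally, a default-constructed `SmallVector` reports its inline capacity via `capacity()`. A minimal sketch (the figures above assume x86_64 and the current default small-size heuristic):

```cpp
// Sketch: print the default inline capacities discussed above.
// Assumes x86_64 and linking against LLVMSupport; the exact numbers depend on
// the pointer size and on SmallVector's default small-size heuristic.
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::SmallVector<void *> A;
  llvm::SmallVector<llvm::SmallVector<void *>> B;
  llvm::SmallVector<llvm::SmallVector<void *, 0>> C;
  std::printf("SmallVector<void*>                 : %zu inline elements\n", A.capacity());
  std::printf("SmallVector<SmallVector<void*>>    : %zu inline elements\n", B.capacity());
  std::printf("SmallVector<SmallVector<void*, 0>> : %zu inline elements\n", C.capacity());
  return 0;
}
```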
https://github.com/llvm/llvm-project/pull/67151