[llvm] [AMDGPU] New image intrinsic optimizer pass (PR #67151)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 25 02:51:21 PDT 2023
================
@@ -0,0 +1,336 @@
+//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
+// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
+//
+// - they refer to the same vaddr except for sample_id,
+// - they use a constant sample_id and they fall into the same group,
+// - they have the same dmask and the number of intrinsics and the number of
+// vaddr/vdata dword transfers is reduced by the combine.
+//
+// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
+//
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
+// | (dmask) | | | | vdata | | vdata | |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+//
+// Some cases are of questionable benefit, like the one marked with "yes?"
+// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
+// and TX, but higher vdata. We start by erring on the side of converting these
+// to MSAA_LOAD.
+//
+// This pass will combine intrinsics such as (not necessarily consecutive):
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+// ==>
+// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+//
+// Future improvements:
+//
+// - We may occasionally not want to do the combine if it increases the maximum
+// register pressure.
+//
+// - Ensure clausing when multiple MSAA_LOAD are generated.
+//
+// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
+// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
+// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
+// we don't know the format at compile time.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
+
+namespace {
+// Legacy-PM function pass that drives the image_load -> image_msaa_load
+// combine described in the file header. The actual work happens in
+// runOnFunction; this class only carries the TargetMachine so the pass can
+// query the subtarget (the combine is gfx11-only, see the note above).
+class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
+  // Target machine, used to reach the GCN subtarget; may be null when the
+  // pass is default-constructed by the pass registry.
+  const TargetMachine *TM;
+
+public:
+  static char ID;
+
+  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+
+  bool runOnFunction(Function &F) override;
+
+}; // End of class AMDGPUImageIntrinsicOptimizer
+} // End anonymous namespace
+
+INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
+ "AMDGPU Image Intrinsic Optimizer", false, false)
+
+char AMDGPUImageIntrinsicOptimizer::ID = 0;
+
+// Append image_load intrinsic II to an existing group in MergeableInsts if it
+// is mergeable with that group, otherwise start a new single-element group.
+// Two loads belong to the same group when they agree on:
+//  - intrinsic ID (and therefore Dim),
+//  - result type (and therefore D16),
+//  - DMask,
+//  - every vaddr operand except the trailing FragId,
+// and their constant FragIds fall into the same group of four (FragId / 4),
+// matching the four samples a single MSAA_LOAD can fetch.
+void addInstToMergeableList(
+    IntrinsicInst *II, std::list<std::list<IntrinsicInst *>> &MergeableInsts,
+    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
+  for (std::list<IntrinsicInst *> &IIList : MergeableInsts) {
+    // Check Dim.
+    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
+      continue;
+
+    // Check D16.
+    if (IIList.front()->getType() != II->getType())
+      continue;
+
+    // Check DMask.
+    Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
+    Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
+    if (DMaskList != DMask)
+      continue;
+
+    // Check VAddr (except FragId).
+    int I = ImageDimIntr->VAddrStart;
+    for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
+      if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
+        break;
+    }
+
+    // The inner loop broke early on a mismatching vaddr operand.
+    if (I != ImageDimIntr->VAddrEnd - 1)
+      continue;
+
+    // Check FragId group. FragId is the last vaddr operand; the caller has
+    // already verified it is a ConstantInt, so the casts below cannot fail.
+    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+    Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
+    auto IIListFragId = cast<ConstantInt>(FragIdList);
+    auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+    if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
+      continue;
+
+    // Add to the list.
+    IIList.emplace_back(II);
+    return;
+  }
+
+  // Similar instruction not found, so add a new list.
+  MergeableInsts.emplace_back(1, II);
+  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
+}
+
+// Collect list of all instructions we know how to merge in a subset of the
+// block. It returns an iterator to the instruction after the last one analyzed,
+// so the caller can resume scanning (and merging) the remainder of the block
+// past any side-effecting instruction that terminated this section.
+BasicBlock::iterator
+collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E,
+                      std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+  for (; I != E; ++I) {
+    // Don't combine if there is a store in the middle or if there is a memory
+    // barrier. Advance past the offending instruction so the caller's next
+    // section starts after it.
+    if (I->mayHaveSideEffects()) {
+      ++I;
+      break;
+    }
+
+    // Ignore non-intrinsics.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID IntrinID = II->getIntrinsicID();
+
+      // Ignore other intrinsics. Only the two MSAA image_load dims can be
+      // rewritten to image_msaa_load (see file header).
+      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
+          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
+        continue;
+
+      // Check for constant FragId (the last vaddr operand); grouping by
+      // FragId / 4 below requires a compile-time constant.
+      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
+      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
+    }
+  }
+
+  return I;
+}
+
+bool optimizeSection(std::list<std::list<IntrinsicInst *>> &MergeableInsts) {
+ bool Modified = false;
+
+ SmallVector<Instruction *, 4> InstrsToErase;
+ for (auto IIList : MergeableInsts) {
----------------
jmmartinez wrote:
This should be `auto& IIList`, otherwise it's copying each list.
https://github.com/llvm/llvm-project/pull/67151
More information about the llvm-commits
mailing list