[llvm] AMDGPU/NewPM: Port SILoadStoreOptimizer to NPM (PR #106191)
Akshat Oke via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 26 23:57:26 PDT 2024
https://github.com/Akshat-Oke created https://github.com/llvm/llvm-project/pull/106191
From 58e02987bd2ae6ad82c4defe4532ffd300a97343 Mon Sep 17 00:00:00 2001
From: Akshat Oke <akshaoke at amd.com>
Date: Mon, 26 Aug 2024 15:34:06 +0530
Subject: [PATCH] AMDGPU/NewPM: Port SILoadStoreOptimizer to NPM
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 6 +-
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 20 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +-
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 286 ++++++++++--------
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h | 77 +++++
.../CodeGen/AMDGPU/load-store-opt-dlc.mir | 1 +
.../CodeGen/AMDGPU/load-store-opt-scc.mir | 1 +
7 files changed, 256 insertions(+), 140 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c50474893eb7d5..3ed0a5eb98c408 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -40,7 +40,7 @@ FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesLegacyPass();
FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
FunctionPass *createSIShrinkInstructionsPass();
-FunctionPass *createSILoadStoreOptimizerPass();
+FunctionPass *createSILoadStoreOptimizerLegacyPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
@@ -190,8 +190,8 @@ extern char &AMDGPUMarkLastScratchLoadID;
void initializeSILowerSGPRSpillsPass(PassRegistry &);
extern char &SILowerSGPRSpillsID;
-void initializeSILoadStoreOptimizerPass(PassRegistry &);
-extern char &SILoadStoreOptimizerID;
+void initializeSILoadStoreOptimizerLegacyPass(PassRegistry &);
+extern char &SILoadStoreOptimizerLegacyID;
void initializeSIWholeQuadModePass(PassRegistry &);
extern char &SIWholeQuadModeID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index d8741b4b06a984..1022f22d01e222 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,9 +22,9 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
-MODULE_PASS("amdgpu-perf-hint",
- AMDGPUPerfHintAnalysisPass(
- *static_cast<const GCNTargetMachine *>(this)))
+MODULE_PASS(
+ "amdgpu-perf-hint",
+ AMDGPUPerfHintAnalysisPass(*static_cast<const GCNTargetMachine *>(this)))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
#undef MODULE_PASS
@@ -47,9 +47,9 @@ FUNCTION_PASS("amdgpu-annotate-uniform", AMDGPUAnnotateUniformValuesPass())
FUNCTION_PASS("amdgpu-codegenprepare", AMDGPUCodeGenPreparePass(*this))
FUNCTION_PASS("amdgpu-image-intrinsic-opt",
AMDGPUImageIntrinsicOptimizerPass(*this))
-FUNCTION_PASS("amdgpu-late-codegenprepare",
- AMDGPULateCodeGenPreparePass(
- *static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS(
+ "amdgpu-late-codegenprepare",
+ AMDGPULateCodeGenPreparePass(*static_cast<const GCNTargetMachine *>(this)))
FUNCTION_PASS("amdgpu-lower-kernel-arguments",
AMDGPULowerKernelArgumentsPass(*this))
FUNCTION_PASS("amdgpu-lower-kernel-attributes",
@@ -64,7 +64,9 @@ FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
AMDGPUUnifyDivergentExitNodesPass())
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
-FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS(
+ "si-annotate-control-flow",
+ SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
#undef FUNCTION_PASS
#ifndef FUNCTION_ANALYSIS
@@ -83,8 +85,7 @@ FUNCTION_ALIAS_ANALYSIS("amdgpu-aa", AMDGPUAA())
#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
#endif
FUNCTION_PASS_WITH_PARAMS(
- "amdgpu-atomic-optimizer",
- "AMDGPUAtomicOptimizerPass",
+ "amdgpu-atomic-optimizer", "AMDGPUAtomicOptimizerPass",
[=](ScanOptions Strategy) {
return AMDGPUAtomicOptimizerPass(*this, Strategy);
},
@@ -97,4 +98,5 @@ FUNCTION_PASS_WITH_PARAMS(
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
+MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
#undef MACHINE_FUNCTION_PASS
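With the MACHINE_FUNCTION_PASS registration above, the ported pass becomes
reachable from llc's new-pass-manager pipeline flag. A minimal sketch of the
two invocation paths, mirroring the RUN lines this patch adds to the MIR
tests at the end (the input file name is illustrative):

    llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-load-store-opt -o - in.mir  # legacy PM
    llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-load-store-opt -o - in.mir    # new PM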
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 570f089e914699..2834fc9e6b4923 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
+#include "SILoadStoreOptimizer.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
@@ -409,7 +410,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIShrinkInstructionsPass(*PR);
initializeSIOptimizeExecMaskingPreRAPass(*PR);
initializeSIOptimizeVGPRLiveRangePass(*PR);
- initializeSILoadStoreOptimizerPass(*PR);
+ initializeSILoadStoreOptimizerLegacyPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
@@ -1261,7 +1262,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SIFoldOperandsID);
if (EnableDPPCombine)
addPass(&GCNDPPCombineID);
- addPass(&SILoadStoreOptimizerID);
+ addPass(&SILoadStoreOptimizerLegacyID);
if (isPassEnabled(EnableSDWAPeephole)) {
addPass(&SIPeepholeSDWAID);
addPass(&EarlyMachineLICMID);
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ddce80b2ae129e..1ec40d5f1b3817 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -57,6 +57,7 @@
//
//===----------------------------------------------------------------------===//
+#include "SILoadStoreOptimizer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -104,7 +105,7 @@ struct AddressRegs {
// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;
-class SILoadStoreOptimizer : public MachineFunctionPass {
+class SILoadStoreOptimizer {
struct CombineInfo {
MachineBasicBlock::iterator I;
unsigned EltSize;
@@ -142,7 +143,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
// with vectors of pointers.
if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
- return false;
+ return false;
}
}
return true;
@@ -155,7 +156,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
if (AddrOp->isImm())
continue;
- // Don't try to merge addresses that aren't either immediates or registers.
+ // Don't try to merge addresses that aren't either immediates or
+ // registers.
// TODO: Should be possible to merge FrameIndexes and maybe some other
// non-register
if (!AddrOp->isReg())
@@ -178,7 +180,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
// Compare by pointer order.
- bool operator<(const CombineInfo& Other) const {
+ bool operator<(const CombineInfo &Other) const {
return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
}
};
@@ -209,8 +211,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
const DenseSet<Register> &ARegUses,
const MachineInstr &A, const MachineInstr &B) const;
- static bool dmasksCanBeCombined(const CombineInfo &CI,
- const SIInstrInfo &TII,
+ static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
const CombineInfo &Paired);
static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
CombineInfo &Paired, bool Modify = false);
@@ -274,15 +275,17 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
- void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
+ void processBaseWithConstOffset(const MachineOperand &Base,
+ MemAddress &Addr) const;
/// Promotes constant offset to the immediate by adjusting the base. It
/// tries to use a base from the nearby instructions that allows it to have
/// a 13bit constant offset which gets promoted to the immediate.
- bool promoteConstantOffsetToImm(MachineInstr &CI,
- MemInfoMap &Visited,
- SmallPtrSet<MachineInstr *, 4> &Promoted) const;
- void addInstToMergeableList(const CombineInfo &CI,
- std::list<std::list<CombineInfo> > &MergeableInsts) const;
+ bool
+ promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
+ SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+ void addInstToMergeableList(
+ const CombineInfo &CI,
+ std::list<std::list<CombineInfo>> &MergeableInsts) const;
std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
@@ -295,16 +298,21 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
static InstClassEnum getCommonInstClass(const CombineInfo &CI,
const CombineInfo &Paired);
+ bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+ bool &OptimizeListAgain);
+ bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
+
public:
- static char ID;
+ SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
- SILoadStoreOptimizer() : MachineFunctionPass(ID) {
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
- }
+ bool run(MachineFunction &MF);
+};
- bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
- bool &OptimizeListAgain);
- bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
+class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -318,8 +326,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
}
MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties()
- .set(MachineFunctionProperties::Property::IsSSA);
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
}
};
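The restructuring above follows the usual recipe for porting a machine pass
to the NPM: the optimization logic moves into a pass-manager-agnostic class
with a run(MachineFunction &) entry point, and thin adapters bind it to each
pass manager. A schematic of the pattern (a sketch with illustrative names,
not code from this patch):

    // Pass-manager-agnostic implementation: owns the state, does the work.
    class FooOptimizer {
      AliasAnalysis *AA;

    public:
      FooOptimizer(AliasAnalysis *AA) : AA(AA) {}
      bool run(MachineFunction &MF);
    };

    // Legacy-PM adapter: pulls analyses via getAnalysis<>.
    class FooOptimizerLegacy : public MachineFunctionPass {
    public:
      static char ID;
      FooOptimizerLegacy() : MachineFunctionPass(ID) {}
      bool runOnMachineFunction(MachineFunction &MF) override {
        return FooOptimizer(&getAnalysis<AAResultsWrapperPass>().getAAResults())
            .run(MF);
      }
    };

    // New-PM adapter: pulls analyses from the analysis manager instead.
    class FooOptimizerPass : public PassInfoMixin<FooOptimizerPass> {
    public:
      PreservedAnalyses run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM);
    };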
@@ -807,14 +815,13 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
switch (InstClass) {
case DS_READ:
- EltSize =
- (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
- : 4;
- break;
- case DS_WRITE:
EltSize =
- (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
- : 4;
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
+ break;
+ case DS_WRITE:
+ EltSize = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9)
+ ? 8
+ : 4;
break;
case S_BUFFER_LOAD_IMM:
case S_BUFFER_LOAD_SGPR_IMM:
@@ -882,18 +889,18 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
} // end anonymous namespace.
-INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
"SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
- false, false)
+INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
+ "SI Load Store Optimizer", false, false)
-char SILoadStoreOptimizer::ID = 0;
+char SILoadStoreOptimizerLegacy::ID = 0;
-char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
+char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
-FunctionPass *llvm::createSILoadStoreOptimizerPass() {
- return new SILoadStoreOptimizer();
+FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
+ return new SILoadStoreOptimizerLegacy();
}
static void addDefsUsesToList(const MachineInstr &MI,
@@ -963,9 +970,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
return false;
// Check other optional immediate operands for equality.
- unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
+ unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
AMDGPU::OpName::unorm, AMDGPU::OpName::da,
- AMDGPU::OpName::r128, AMDGPU::OpName::a16};
+ AMDGPU::OpName::r128, AMDGPU::OpName::a16};
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -991,8 +998,8 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
}
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
- unsigned ComponentCount,
- const GCNSubtarget &STI) {
+ unsigned ComponentCount,
+ const GCNSubtarget &STI) {
if (ComponentCount > 4)
return 0;
@@ -1060,7 +1067,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
if (Info0->BitsPerComp != 32)
return false;
- if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
+ if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) ==
+ 0)
return false;
}
@@ -1072,7 +1080,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
// Handle all non-DS instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
if (EltOffset0 + CI.Width != EltOffset1 &&
- EltOffset1 + Paired.Width != EltOffset0)
+ EltOffset1 + Paired.Width != EltOffset0)
return false;
if (CI.CPol != Paired.CPol)
return false;
@@ -1554,12 +1562,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
MachineInstr *New =
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(MergedOffset) // offset
- .addImm(CI.CPol) // cpol
- .addImm(0) // swz
- .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.CPol) // cpol
+ .addImm(0) // swz
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
@@ -1644,9 +1652,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
- .addImm(JoinedFormat) // format
- .addImm(CI.CPol) // cpol
- .addImm(0) // swz
+ .addImm(JoinedFormat) // format
+ .addImm(CI.CPol) // cpol
+ .addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
@@ -1671,10 +1679,10 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
MIB.add(*SAddr);
MachineInstr *New =
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
- .addImm(std::min(CI.Offset, Paired.Offset))
- .addImm(CI.CPol)
- .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+ .addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
@@ -1701,10 +1709,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
MIB.add(*SAddr);
- MachineInstr *New =
- MIB.addImm(std::min(CI.Offset, Paired.Offset))
- .addImm(CI.CPol)
- .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+ MachineInstr *New = MIB.addImm(std::min(CI.Offset, Paired.Offset))
+ .addImm(CI.CPol)
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1887,11 +1894,16 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
unsigned Idx1;
static const unsigned Idxs[5][4] = {
- {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
- {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
- {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
- {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
- {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
+ AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
+ AMDGPU::sub1_sub2_sub3_sub4},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
+ AMDGPU::sub2_sub3_sub4_sub5},
+ {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
+ AMDGPU::sub3_sub4_sub5_sub6},
+ {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
+ AMDGPU::sub4_sub5_sub6_sub7},
};
assert(CI.Width >= 1 && CI.Width <= 4);
@@ -1954,36 +1966,35 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
-
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
// will return true if this is the case.
assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
MachineInstr *New =
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(std::min(CI.Offset, Paired.Offset)) // offset
- .addImm(CI.CPol) // cpol
- .addImm(0) // swz
- .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(std::min(CI.Offset, Paired.Offset)) // offset
+ .addImm(CI.CPol) // cpol
+ .addImm(0) // swz
+ .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineOperand
-SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
+MachineOperand SILoadStoreOptimizer::createRegOrImm(int32_t Val,
+ MachineInstr &MI) const {
APInt V(32, Val, true);
if (TII->isInlineConstant(V))
return MachineOperand::CreateImm(Val);
Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineInstr *Mov =
- BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
- TII->get(AMDGPU::S_MOV_B32), Reg)
- .addImm(Val);
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), Reg)
+ .addImm(Val);
(void)Mov;
LLVM_DEBUG(dbgs() << " "; Mov->dump());
return MachineOperand::CreateReg(Reg, false);
@@ -2005,9 +2016,10 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
"Expected 32-bit Base-Register-Hi!!");
LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
- MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
+ MachineOperand OffsetLo =
+ createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
MachineOperand OffsetHi =
- createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
+ createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register CarryReg = MRI->createVirtualRegister(CarryRC);
@@ -2016,31 +2028,31 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *LoHalf =
- BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
- .addReg(CarryReg, RegState::Define)
- .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
- .add(OffsetLo)
- .addImm(0); // clamp bit
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+ .add(OffsetLo)
+ .addImm(0); // clamp bit
(void)LoHalf;
LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
MachineInstr *HiHalf =
- BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
- .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
- .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
- .add(OffsetHi)
- .addReg(CarryReg, RegState::Kill)
- .addImm(0); // clamp bit
+ BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+ .add(OffsetHi)
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
MachineInstr *FullBase =
- BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
+ BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
(void)FullBase;
LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
@@ -2083,14 +2095,14 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
// %Base:vreg_64 =
// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
-void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
- MemAddress &Addr) const {
+void SILoadStoreOptimizer::processBaseWithConstOffset(
+ const MachineOperand &Base, MemAddress &Addr) const {
if (!Base.isReg())
return;
MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
- if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
- || Def->getNumOperands() != 5)
+ if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
+ Def->getNumOperands() != 5)
return;
MachineOperand BaseLo = Def->getOperand(1);
@@ -2143,8 +2155,7 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
}
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
- MachineInstr &MI,
- MemInfoMap &Visited,
+ MachineInstr &MI, MemInfoMap &Visited,
SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
@@ -2183,7 +2194,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
}
LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
- << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
+ << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
+ << "\n\n";);
// Step2: Traverse through MI's basic block and find an anchor(that has the
// same base-registers) with the highest 13bit distance from MI's offset.
@@ -2220,9 +2232,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineBasicBlock::iterator MBBI = MI.getIterator();
++MBBI;
const SITargetLowering *TLI =
- static_cast<const SITargetLowering *>(STM->getTargetLowering());
+ static_cast<const SITargetLowering *>(STM->getTargetLowering());
- for ( ; MBBI != E; ++MBBI) {
+ for (; MBBI != E; ++MBBI) {
MachineInstr &MINext = *MBBI;
// TODO: Support finding an anchor(with same base) from store addresses or
// any other load addresses where the opcodes are different.
@@ -2231,7 +2243,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
continue;
const MachineOperand &BaseNext =
- *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+ *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
MemAddress MAddrNext;
if (!Visited.contains(&MINext)) {
processBaseWithConstOffset(BaseNext, MAddrNext);
@@ -2263,8 +2275,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
if (AnchorInst) {
LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
AnchorInst->dump());
- LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
- << AnchorAddr.Offset << "\n\n");
+ LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " << AnchorAddr.Offset
+ << "\n\n");
// Instead of moving up, just re-compute anchor-instruction's base address.
Register Base = computeBase(MI, AnchorAddr);
@@ -2291,8 +2303,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
return false;
}
-void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
- std::list<std::list<CombineInfo> > &MergeableInsts) const {
+void SILoadStoreOptimizer::addInstToMergeableList(
+ const CombineInfo &CI,
+ std::list<std::list<CombineInfo>> &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {
if (AddrList.front().InstClass == CI.InstClass &&
AddrList.front().IsAGPR == CI.IsAGPR &&
@@ -2313,14 +2326,15 @@ SILoadStoreOptimizer::collectMergeableInsts(
std::list<std::list<CombineInfo>> &MergeableInsts) const {
bool Modified = false;
- // Sort potential mergeable instructions into lists. One list per base address.
+ // Sort potential mergeable instructions into lists. One list per base
+ // address.
unsigned Order = 0;
MachineBasicBlock::iterator BlockI = Begin;
for (; BlockI != End; ++BlockI) {
MachineInstr &MI = *BlockI;
- // We run this before checking if an address is mergeable, because it can produce
- // better code even if the instructions aren't mergeable.
+ // We run this before checking if an address is mergeable, because it can
+ // produce better code even if the instructions aren't mergeable.
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
@@ -2369,12 +2383,13 @@ SILoadStoreOptimizer::collectMergeableInsts(
// At this point we have lists of Mergeable instructions.
//
// Part 2: Sort lists by offset and then for each CombineInfo object in the
- // list try to find an instruction that can be merged with I. If an instruction
- // is found, it is stored in the Paired field. If no instructions are found, then
- // the CombineInfo object is deleted from the list.
+ // list try to find an instruction that can be merged with I. If an
+ // instruction is found, it is stored in the Paired field. If no instructions
+ // are found, then the CombineInfo object is deleted from the list.
for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
- E = MergeableInsts.end(); I != E;) {
+ E = MergeableInsts.end();
+ I != E;) {
std::list<CombineInfo> &MergeList = *I;
if (MergeList.size() <= 1) {
@@ -2388,10 +2403,9 @@ SILoadStoreOptimizer::collectMergeableInsts(
// Sort the lists by offsets, this way mergeable instructions will be
// adjacent to each other in the list, which will make it easier to find
// matches.
- MergeList.sort(
- [] (const CombineInfo &A, const CombineInfo &B) {
- return A.Offset < B.Offset;
- });
+ MergeList.sort([](const CombineInfo &A, const CombineInfo &B) {
+ return A.Offset < B.Offset;
+ });
++I;
}
@@ -2402,11 +2416,12 @@ SILoadStoreOptimizer::collectMergeableInsts(
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
- std::list<std::list<CombineInfo> > &MergeableInsts) {
+ std::list<std::list<CombineInfo>> &MergeableInsts) {
bool Modified = false;
for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
- E = MergeableInsts.end(); I != E;) {
+ E = MergeableInsts.end();
+ I != E;) {
std::list<CombineInfo> &MergeList = *I;
bool OptimizeListAgain = false;
@@ -2431,10 +2446,8 @@ bool SILoadStoreOptimizer::optimizeBlock(
return Modified;
}
-bool
-SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
- std::list<CombineInfo> &MergeList,
- bool &OptimizeListAgain) {
+bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+ std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
if (MergeList.empty())
return false;
@@ -2522,10 +2535,15 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
return Modified;
}
-bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
+ SILoadStoreOptimizer Impl(
+ &getAnalysis<AAResultsWrapperPass>().getAAResults());
+ return Impl.run(MF);
+}
+bool SILoadStoreOptimizer::run(MachineFunction &MF) {
STM = &MF.getSubtarget<GCNSubtarget>();
if (!STM->loadStoreOptEnabled())
return false;
@@ -2534,7 +2552,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
@@ -2571,3 +2589,19 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
return Modified;
}
+
+PreservedAnalyses
+SILoadStoreOptimizerPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+ .getManager();
+ AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
+ SILoadStoreOptimizer Impl(&AA);
+ bool Changed = Impl.run(MF);
+ if (!Changed) {
+ return PreservedAnalyses::all();
+ }
+ PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
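A note on the new entry point: under the NPM, AAResults lives in the
function-level analysis manager, so the machine function pass reaches it
through FunctionAnalysisManagerMachineFunctionProxy, as above. On the
no-change path the pass can report PreservedAnalyses::all(); otherwise it
starts from getMachineFunctionPassPreservedAnalyses() and additionally
preserves CFG analyses, since the pass merges and rewrites memory
instructions but never changes control flow.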
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
new file mode 100644
index 00000000000000..fc1becb07d7276
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
@@ -0,0 +1,77 @@
+//===--- SILoadStoreOptimizer.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with close by immediate offsets.
+// This will fuse operations such as
+// ds_read_b32 v0, v2 offset:16
+// ds_read_b32 v1, v2 offset:32
+// ==>
+// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+// The same is done for certain SMEM and VMEM opcodes, e.g.:
+// s_buffer_load_dword s4, s[0:3], 4
+// s_buffer_load_dword s5, s[0:3], 8
+// ==>
+// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
+// This pass also tries to promote constant offset to the immediate by
+// adjusting the base. It tries to use a base from the nearby instructions that
+// allows it to have a 13bit constant offset and then promotes the 13bit offset
+// to the immediate.
+// E.g.
+// s_movk_i32 s0, 0x1800
+// v_add_co_u32_e32 v0, vcc, s0, v2
+// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+// s_movk_i32 s0, 0x1000
+// v_add_co_u32_e32 v5, vcc, s0, v2
+// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+// global_load_dwordx2 v[5:6], v[5:6], off
+// global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+// s_movk_i32 s0, 0x1000
+// v_add_co_u32_e32 v5, vcc, s0, v2
+// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+// global_load_dwordx2 v[5:6], v[5:6], off
+// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
+//
+// Future improvements:
+//
+// - This is currently missing stores of constants because loading
+// the constant into the data register is placed between the stores, although
+// this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+// one pair, and recomputes live intervals and moves on to the next pair. It
+// would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+// cluster of loads have offsets that are too large to fit in the 8-bit
+// offsets, but are close enough to fit in the 8 bits, we can add to the base
+// pointer and use the new reduced offsets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class SILoadStoreOptimizerPass
+ : public PassInfoMixin<SILoadStoreOptimizerPass> {
+public:
+ SILoadStoreOptimizerPass() = default;
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
index f4cdedf9cf6eb8..9295bd59621039 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
@@ -1,4 +1,5 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
# The purpose of this test is to make sure we are combining relevant memory
# operations correctly with/without DLC bit.
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
index c4e131b90deb48..c0cc3e9f4edd7f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
@@ -1,4 +1,5 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -passes=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
# The purpose of this test is to make sure we are combining relevant memory
# operations correctly with/without SCC bit.