[llvm] AMDGPU/NewPM: Port SILoadStoreOptimizer to NPM (PR #106191)

Akshat Oke via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 26 23:57:26 PDT 2024


https://github.com/Akshat-Oke created https://github.com/llvm/llvm-project/pull/106191

This ports SILoadStoreOptimizer to the new pass manager. The merging logic moves out of the MachineFunctionPass subclass into a standalone SILoadStoreOptimizer class driven through run(MachineFunction &), while a thin SILoadStoreOptimizerLegacy wrapper keeps the legacy pipeline working. The NPM pass is declared in a new SILoadStoreOptimizer.h, registered as "si-load-store-opt" in AMDGPUPassRegistry.def, and the two load-store-opt MIR tests gain -passes= RUN lines alongside the existing -run-pass ones.

From 58e02987bd2ae6ad82c4defe4532ffd300a97343 Mon Sep 17 00:00:00 2001
From: Akshat Oke <akshaoke at amd.com>
Date: Mon, 26 Aug 2024 15:34:06 +0530
Subject: [PATCH] AMDGPU/NewPM: Port SILoadStoreOptimizer to NPM

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   6 +-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  20 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   5 +-
 .../Target/AMDGPU/SILoadStoreOptimizer.cpp    | 286 ++++++++++--------
 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h |  77 +++++
 .../CodeGen/AMDGPU/load-store-opt-dlc.mir     |   1 +
 .../CodeGen/AMDGPU/load-store-opt-scc.mir     |   1 +
 7 files changed, 256 insertions(+), 140 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c50474893eb7d5..3ed0a5eb98c408 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -40,7 +40,7 @@ FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesLegacyPass();
 FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
 FunctionPass *createSIShrinkInstructionsPass();
-FunctionPass *createSILoadStoreOptimizerPass();
+FunctionPass *createSILoadStoreOptimizerLegacyPass();
 FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
@@ -190,8 +190,8 @@ extern char &AMDGPUMarkLastScratchLoadID;
 void initializeSILowerSGPRSpillsPass(PassRegistry &);
 extern char &SILowerSGPRSpillsID;
 
-void initializeSILoadStoreOptimizerPass(PassRegistry &);
-extern char &SILoadStoreOptimizerID;
+void initializeSILoadStoreOptimizerLegacyPass(PassRegistry &);
+extern char &SILoadStoreOptimizerLegacyID;
 
 void initializeSIWholeQuadModePass(PassRegistry &);
 extern char &SIWholeQuadModeID;
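
The renames above follow the convention used across the AMDGPU new-pass-manager ports: every legacy-PM entry point takes a "Legacy" suffix, which frees the plain name for the NPM pass class. After this patch the declarations pair up as sketched below (the NPM class itself lives in the new header added later in the patch):

  // Legacy PM entry points (AMDGPU.h):
  FunctionPass *createSILoadStoreOptimizerLegacyPass();
  void initializeSILoadStoreOptimizerLegacyPass(PassRegistry &);
  extern char &SILoadStoreOptimizerLegacyID;

  // New PM pass (SILoadStoreOptimizer.h, new in this patch):
  class SILoadStoreOptimizerPass; // PassInfoMixin-based, runs on a MachineFunction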
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index d8741b4b06a984..1022f22d01e222 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,9 +22,9 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
-MODULE_PASS("amdgpu-perf-hint",
-            AMDGPUPerfHintAnalysisPass(
-              *static_cast<const GCNTargetMachine *>(this)))
+MODULE_PASS(
+    "amdgpu-perf-hint",
+    AMDGPUPerfHintAnalysisPass(*static_cast<const GCNTargetMachine *>(this)))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
 #undef MODULE_PASS
@@ -47,9 +47,9 @@ FUNCTION_PASS("amdgpu-annotate-uniform", AMDGPUAnnotateUniformValuesPass())
 FUNCTION_PASS("amdgpu-codegenprepare", AMDGPUCodeGenPreparePass(*this))
 FUNCTION_PASS("amdgpu-image-intrinsic-opt",
               AMDGPUImageIntrinsicOptimizerPass(*this))
-FUNCTION_PASS("amdgpu-late-codegenprepare",
-              AMDGPULateCodeGenPreparePass(
-                *static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS(
+    "amdgpu-late-codegenprepare",
+    AMDGPULateCodeGenPreparePass(*static_cast<const GCNTargetMachine *>(this)))
 FUNCTION_PASS("amdgpu-lower-kernel-arguments",
               AMDGPULowerKernelArgumentsPass(*this))
 FUNCTION_PASS("amdgpu-lower-kernel-attributes",
@@ -64,7 +64,9 @@ FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
 FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
-FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
+FUNCTION_PASS(
+    "si-annotate-control-flow",
+    SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
 
 #ifndef FUNCTION_ANALYSIS
@@ -83,8 +85,7 @@ FUNCTION_ALIAS_ANALYSIS("amdgpu-aa", AMDGPUAA())
 #define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
 #endif
 FUNCTION_PASS_WITH_PARAMS(
-    "amdgpu-atomic-optimizer",
-    "AMDGPUAtomicOptimizerPass",
+    "amdgpu-atomic-optimizer", "AMDGPUAtomicOptimizerPass",
     [=](ScanOptions Strategy) {
       return AMDGPUAtomicOptimizerPass(*this, Strategy);
     },
@@ -97,4 +98,5 @@ FUNCTION_PASS_WITH_PARAMS(
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
 MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass())
 MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
+MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass())
 #undef MACHINE_FUNCTION_PASS
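
With this MACHINE_FUNCTION_PASS entry the pass becomes addressable by name in textual pipelines (the -passes=si-load-store-opt RUN lines added to the MIR tests at the end of the patch exercise exactly that), and it can also be scheduled directly when building a machine pipeline in C++. A minimal sketch (hypothetical driver code, not part of the patch; the helper name is made up):

  #include "SILoadStoreOptimizer.h"
  #include "llvm/CodeGen/MachinePassManager.h"

  // Add the NPM load/store optimizer to an existing machine pipeline.
  void addSILoadStoreOpt(llvm::MachineFunctionPassManager &MFPM) {
    MFPM.addPass(llvm::SILoadStoreOptimizerPass());
  }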
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 570f089e914699..2834fc9e6b4923 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "R600.h"
 #include "R600TargetMachine.h"
 #include "SIFixSGPRCopies.h"
+#include "SILoadStoreOptimizer.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIMachineScheduler.h"
 #include "TargetInfo/AMDGPUTargetInfo.h"
@@ -409,7 +410,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSIOptimizeVGPRLiveRangePass(*PR);
-  initializeSILoadStoreOptimizerPass(*PR);
+  initializeSILoadStoreOptimizerLegacyPass(*PR);
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUSwLowerLDSLegacyPass(*PR);
@@ -1261,7 +1262,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
   addPass(&SIFoldOperandsID);
   if (EnableDPPCombine)
     addPass(&GCNDPPCombineID);
-  addPass(&SILoadStoreOptimizerID);
+  addPass(&SILoadStoreOptimizerLegacyID);
   if (isPassEnabled(EnableSDWAPeephole)) {
     addPass(&SIPeepholeSDWAID);
     addPass(&EarlyMachineLICMID);
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ddce80b2ae129e..1ec40d5f1b3817 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -57,6 +57,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SILoadStoreOptimizer.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -104,7 +105,7 @@ struct AddressRegs {
 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
 const unsigned MaxAddressRegs = 12 + 1 + 1;
 
-class SILoadStoreOptimizer : public MachineFunctionPass {
+class SILoadStoreOptimizer {
   struct CombineInfo {
     MachineBasicBlock::iterator I;
     unsigned EltSize;
@@ -142,7 +143,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
         // with vectors of pointers.
         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
-         return false;
+          return false;
         }
       }
       return true;
@@ -155,7 +156,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
         if (AddrOp->isImm())
           continue;
 
-        // Don't try to merge addresses that aren't either immediates or registers.
+        // Don't try to merge addresses that aren't either immediates or
+        // registers.
         // TODO: Should be possible to merge FrameIndexes and maybe some other
         // non-register
         if (!AddrOp->isReg())
@@ -178,7 +180,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
 
     // Compare by pointer order.
-    bool operator<(const CombineInfo& Other) const {
+    bool operator<(const CombineInfo &Other) const {
       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
     }
   };
@@ -209,8 +211,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                            const DenseSet<Register> &ARegUses,
                            const MachineInstr &A, const MachineInstr &B) const;
-  static bool dmasksCanBeCombined(const CombineInfo &CI,
-                                  const SIInstrInfo &TII,
+  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
                                   const CombineInfo &Paired);
   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                    CombineInfo &Paired, bool Modify = false);
@@ -274,15 +275,17 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
-  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
+  void processBaseWithConstOffset(const MachineOperand &Base,
+                                  MemAddress &Addr) const;
   /// Promotes constant offset to the immediate by adjusting the base. It
   /// tries to use a base from the nearby instructions that allows it to have
   /// a 13bit constant offset which gets promoted to the immediate.
-  bool promoteConstantOffsetToImm(MachineInstr &CI,
-                                  MemInfoMap &Visited,
-                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
-  void addInstToMergeableList(const CombineInfo &CI,
-                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
+  bool
+  promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
+                             SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+  void addInstToMergeableList(
+      const CombineInfo &CI,
+      std::list<std::list<CombineInfo>> &MergeableInsts) const;
 
   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
@@ -295,16 +298,21 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                           const CombineInfo &Paired);
 
+  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+                                     bool &OptimizeListAgain);
+  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
+
 public:
-  static char ID;
+  SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
 
-  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
-    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
-  }
+  bool run(MachineFunction &MF);
+};
 
-  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
-                                     bool &OptimizeListAgain);
-  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
+class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -318,8 +326,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   }
 
   MachineFunctionProperties getRequiredProperties() const override {
-    return MachineFunctionProperties()
-      .set(MachineFunctionProperties::Property::IsSSA);
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
   }
 };
 
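These hunks are the heart of the port: the merging logic becomes a plain class that either pass manager can drive. The split is needed because an NPM pass cannot call getAnalysis<>(); its dependencies have to be handed in explicitly, here through the constructor. Reduced to a skeleton (a sketch; members and the combine logic are elided):

  class SILoadStoreOptimizer {      // pass-manager-agnostic implementation
    AliasAnalysis *AA = nullptr;    // injected by whichever PM drives the pass
    // ... subtarget pointers, CombineInfo machinery ...
  public:
    SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
    bool run(MachineFunction &MF);  // returns true if anything was merged
  };
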
@@ -807,14 +815,13 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
 
   switch (InstClass) {
   case DS_READ:
-   EltSize =
-          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
-                                                                          : 4;
-   break;
-  case DS_WRITE:
     EltSize =
-          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
-                                                                            : 4;
+        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
+    break;
+  case DS_WRITE:
+    EltSize = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9)
+                  ? 8
+                  : 4;
     break;
   case S_BUFFER_LOAD_IMM:
   case S_BUFFER_LOAD_SGPR_IMM:
@@ -882,18 +889,18 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
 
 } // end anonymous namespace.
 
-INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                       "SI Load Store Optimizer", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
-                    false, false)
+INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
+                    "SI Load Store Optimizer", false, false)
 
-char SILoadStoreOptimizer::ID = 0;
+char SILoadStoreOptimizerLegacy::ID = 0;
 
-char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
+char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
 
-FunctionPass *llvm::createSILoadStoreOptimizerPass() {
-  return new SILoadStoreOptimizer();
+FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
+  return new SILoadStoreOptimizerLegacy();
 }
 
 static void addDefsUsesToList(const MachineInstr &MI,
@@ -963,9 +970,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
     return false;
 
   // Check other optional immediate operands for equality.
-  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
+  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol,  AMDGPU::OpName::d16,
                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
-                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};
+                                AMDGPU::OpName::r128,  AMDGPU::OpName::a16};
 
   for (auto op : OperandsToMatch) {
     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -991,8 +998,8 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
 }
 
 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
-                                       unsigned ComponentCount,
-                                       const GCNSubtarget &STI) {
+                                             unsigned ComponentCount,
+                                             const GCNSubtarget &STI) {
   if (ComponentCount > 4)
     return 0;
 
@@ -1060,7 +1067,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
     if (Info0->BitsPerComp != 32)
       return false;
 
-    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
+    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) ==
+        0)
       return false;
   }
 
@@ -1072,7 +1080,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
   // Handle all non-DS instructions.
   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
     if (EltOffset0 + CI.Width != EltOffset1 &&
-            EltOffset1 + Paired.Width != EltOffset0)
+        EltOffset1 + Paired.Width != EltOffset0)
       return false;
     if (CI.CPol != Paired.CPol)
       return false;
@@ -1554,12 +1562,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
   MachineInstr *New =
-    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-        .addImm(MergedOffset) // offset
-        .addImm(CI.CPol)      // cpol
-        .addImm(0)            // swz
-        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(MergedOffset) // offset
+          .addImm(CI.CPol)      // cpol
+          .addImm(0)            // swz
+          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
 
@@ -1644,9 +1652,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
           .addImm(std::min(CI.Offset, Paired.Offset)) // offset
-          .addImm(JoinedFormat)                     // format
-          .addImm(CI.CPol)                          // cpol
-          .addImm(0)                                // swz
+          .addImm(JoinedFormat)                       // format
+          .addImm(CI.CPol)                            // cpol
+          .addImm(0)                                  // swz
           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   CI.I->eraseFromParent();
@@ -1671,10 +1679,10 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
     MIB.add(*SAddr);
 
   MachineInstr *New =
-    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
-       .addImm(std::min(CI.Offset, Paired.Offset))
-       .addImm(CI.CPol)
-       .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
+          .addImm(std::min(CI.Offset, Paired.Offset))
+          .addImm(CI.CPol)
+          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
 
@@ -1701,10 +1709,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
     MIB.add(*SAddr);
 
-  MachineInstr *New =
-    MIB.addImm(std::min(CI.Offset, Paired.Offset))
-       .addImm(CI.CPol)
-       .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+  MachineInstr *New = MIB.addImm(std::min(CI.Offset, Paired.Offset))
+                          .addImm(CI.CPol)
+                          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
@@ -1887,11 +1894,16 @@ SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
   unsigned Idx1;
 
   static const unsigned Idxs[5][4] = {
-      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
-      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
-      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
-      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
-      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
+      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
+       AMDGPU::sub0_sub1_sub2_sub3},
+      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
+       AMDGPU::sub1_sub2_sub3_sub4},
+      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
+       AMDGPU::sub2_sub3_sub4_sub5},
+      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
+       AMDGPU::sub3_sub4_sub5_sub6},
+      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
+       AMDGPU::sub4_sub5_sub6_sub7},
   };
 
   assert(CI.Width >= 1 && CI.Width <= 4);
@@ -1954,36 +1966,35 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
   if (Regs.VAddr)
     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
 
-
   // It shouldn't be possible to get this far if the two instructions
   // don't have a single memoperand, because MachineInstr::mayAlias()
   // will return true if this is the case.
   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
 
   MachineInstr *New =
-    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-        .addImm(std::min(CI.Offset, Paired.Offset)) // offset
-        .addImm(CI.CPol)      // cpol
-        .addImm(0)            // swz
-        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
+      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
+          .addImm(CI.CPol)                            // cpol
+          .addImm(0)                                  // swz
+          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
 
   CI.I->eraseFromParent();
   Paired.I->eraseFromParent();
   return New;
 }
 
-MachineOperand
-SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
+MachineOperand SILoadStoreOptimizer::createRegOrImm(int32_t Val,
+                                                    MachineInstr &MI) const {
   APInt V(32, Val, true);
   if (TII->isInlineConstant(V))
     return MachineOperand::CreateImm(Val);
 
   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
   MachineInstr *Mov =
-  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
-          TII->get(AMDGPU::S_MOV_B32), Reg)
-    .addImm(Val);
+      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+              TII->get(AMDGPU::S_MOV_B32), Reg)
+          .addImm(Val);
   (void)Mov;
   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
   return MachineOperand::CreateReg(Reg, false);
@@ -2005,9 +2016,10 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
          "Expected 32-bit Base-Register-Hi!!");
 
   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
-  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
+  MachineOperand OffsetLo =
+      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
   MachineOperand OffsetHi =
-    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
+      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
 
   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   Register CarryReg = MRI->createVirtualRegister(CarryRC);
@@ -2016,31 +2028,31 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   MachineInstr *LoHalf =
-    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
-      .addReg(CarryReg, RegState::Define)
-      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
-      .add(OffsetLo)
-      .addImm(0); // clamp bit
+      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
+          .addReg(CarryReg, RegState::Define)
+          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+          .add(OffsetLo)
+          .addImm(0); // clamp bit
   (void)LoHalf;
   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
 
   MachineInstr *HiHalf =
-  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
-    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
-    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
-    .add(OffsetHi)
-    .addReg(CarryReg, RegState::Kill)
-    .addImm(0); // clamp bit
+      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+          .add(OffsetHi)
+          .addReg(CarryReg, RegState::Kill)
+          .addImm(0); // clamp bit
   (void)HiHalf;
   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
 
   Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
   MachineInstr *FullBase =
-    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
-      .addReg(DestSub0)
-      .addImm(AMDGPU::sub0)
-      .addReg(DestSub1)
-      .addImm(AMDGPU::sub1);
+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
   (void)FullBase;
   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
 
@@ -2083,14 +2095,14 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
 //   %Base:vreg_64 =
 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
-void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
-                                                      MemAddress &Addr) const {
+void SILoadStoreOptimizer::processBaseWithConstOffset(
+    const MachineOperand &Base, MemAddress &Addr) const {
   if (!Base.isReg())
     return;
 
   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
-  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
-      || Def->getNumOperands() != 5)
+  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
+      Def->getNumOperands() != 5)
     return;
 
   MachineOperand BaseLo = Def->getOperand(1);
@@ -2143,8 +2155,7 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
 }
 
 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
-    MachineInstr &MI,
-    MemInfoMap &Visited,
+    MachineInstr &MI, MemInfoMap &Visited,
     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
 
   if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
@@ -2183,7 +2194,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   }
 
   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
-             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
+                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
+                    << "\n\n";);
 
   // Step2: Traverse through MI's basic block and find an anchor(that has the
   // same base-registers) with the highest 13bit distance from MI's offset.
@@ -2220,9 +2232,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   MachineBasicBlock::iterator MBBI = MI.getIterator();
   ++MBBI;
   const SITargetLowering *TLI =
-    static_cast<const SITargetLowering *>(STM->getTargetLowering());
+      static_cast<const SITargetLowering *>(STM->getTargetLowering());
 
-  for ( ; MBBI != E; ++MBBI) {
+  for (; MBBI != E; ++MBBI) {
     MachineInstr &MINext = *MBBI;
     // TODO: Support finding an anchor(with same base) from store addresses or
     // any other load addresses where the opcodes are different.
@@ -2231,7 +2243,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
       continue;
 
     const MachineOperand &BaseNext =
-      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
     MemAddress MAddrNext;
     if (!Visited.contains(&MINext)) {
       processBaseWithConstOffset(BaseNext, MAddrNext);
@@ -2263,8 +2275,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   if (AnchorInst) {
     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
                AnchorInst->dump());
-    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
-               <<  AnchorAddr.Offset << "\n\n");
+    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
+                      << "\n\n");
 
     // Instead of moving up, just re-compute anchor-instruction's base address.
     Register Base = computeBase(MI, AnchorAddr);
@@ -2291,8 +2303,9 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   return false;
 }
 
-void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
-                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+void SILoadStoreOptimizer::addInstToMergeableList(
+    const CombineInfo &CI,
+    std::list<std::list<CombineInfo>> &MergeableInsts) const {
   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
     if (AddrList.front().InstClass == CI.InstClass &&
         AddrList.front().IsAGPR == CI.IsAGPR &&
@@ -2313,14 +2326,15 @@ SILoadStoreOptimizer::collectMergeableInsts(
     std::list<std::list<CombineInfo>> &MergeableInsts) const {
   bool Modified = false;
 
-  // Sort potential mergeable instructions into lists.  One list per base address.
+  // Sort potential mergeable instructions into lists.  One list per base
+  // address.
   unsigned Order = 0;
   MachineBasicBlock::iterator BlockI = Begin;
   for (; BlockI != End; ++BlockI) {
     MachineInstr &MI = *BlockI;
 
-    // We run this before checking if an address is mergeable, because it can produce
-    // better code even if the instructions aren't mergeable.
+    // We run this before checking if an address is mergeable, because it can
+    // produce better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
@@ -2369,12 +2383,13 @@ SILoadStoreOptimizer::collectMergeableInsts(
   // At this point we have lists of Mergeable instructions.
   //
   // Part 2: Sort lists by offset and then for each CombineInfo object in the
-  // list try to find an instruction that can be merged with I.  If an instruction
-  // is found, it is stored in the Paired field.  If no instructions are found, then
-  // the CombineInfo object is deleted from the list.
+  // list try to find an instruction that can be merged with I.  If an
+  // instruction is found, it is stored in the Paired field.  If no instructions
+  // are found, then the CombineInfo object is deleted from the list.
 
   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
-                                                   E = MergeableInsts.end(); I != E;) {
+                                                   E = MergeableInsts.end();
+       I != E;) {
 
     std::list<CombineInfo> &MergeList = *I;
     if (MergeList.size() <= 1) {
@@ -2388,10 +2403,9 @@ SILoadStoreOptimizer::collectMergeableInsts(
     // Sort the lists by offsets, this way mergeable instructions will be
     // adjacent to each other in the list, which will make it easier to find
     // matches.
-    MergeList.sort(
-        [] (const CombineInfo &A, const CombineInfo &B) {
-          return A.Offset < B.Offset;
-        });
+    MergeList.sort([](const CombineInfo &A, const CombineInfo &B) {
+      return A.Offset < B.Offset;
+    });
     ++I;
   }
 
@@ -2402,11 +2416,12 @@ SILoadStoreOptimizer::collectMergeableInsts(
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(
-                       std::list<std::list<CombineInfo> > &MergeableInsts) {
+    std::list<std::list<CombineInfo>> &MergeableInsts) {
   bool Modified = false;
 
   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
-                                                   E = MergeableInsts.end(); I != E;) {
+                                                   E = MergeableInsts.end();
+       I != E;) {
     std::list<CombineInfo> &MergeList = *I;
 
     bool OptimizeListAgain = false;
@@ -2431,10 +2446,8 @@ bool SILoadStoreOptimizer::optimizeBlock(
   return Modified;
 }
 
-bool
-SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
-                                          std::list<CombineInfo> &MergeList,
-                                          bool &OptimizeListAgain) {
+bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
   if (MergeList.empty())
     return false;
 
@@ -2522,10 +2535,15 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
   return Modified;
 }
 
-bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
+  SILoadStoreOptimizer Impl(
+      &getAnalysis<AAResultsWrapperPass>().getAAResults());
+  return Impl.run(MF);
+}
 
+bool SILoadStoreOptimizer::run(MachineFunction &MF) {
   STM = &MF.getSubtarget<GCNSubtarget>();
   if (!STM->loadStoreOptEnabled())
     return false;
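
One nuance in the delegation above: skipFunction(), which honors optnone and opt-bisect, is checked only on the legacy path. The NPM entry point added at the end of this file constructs the impl and calls run(MF) unconditionally. If the NPM pass should match the legacy behavior, a guard along these lines would be needed at the top of SILoadStoreOptimizerPass::run (hypothetical; not part of this patch):

  // Hypothetical, partial equivalent of the legacy skipFunction() check
  // (covers optnone only; opt-bisect would need the OptPassGate machinery):
  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();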
@@ -2534,7 +2552,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   TRI = &TII->getRegisterInfo();
 
   MRI = &MF.getRegInfo();
-  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  // AA is supplied through the constructor by both pass-manager entry points.
 
   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
 
@@ -2571,3 +2589,19 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
 
   return Modified;
 }
+
+PreservedAnalyses
+SILoadStoreOptimizerPass::run(MachineFunction &MF,
+                              MachineFunctionAnalysisManager &MFAM) {
+  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
+                  .getManager();
+  AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
+  SILoadStoreOptimizer Impl(&AA);
+  bool Changed = Impl.run(MF);
+  if (!Changed) {
+    return PreservedAnalyses::all();
+  }
+  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
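
The new run method above strings together the two standard NPM idioms for machine passes: function-level analyses such as AAManager are fetched through FunctionAnalysisManagerMachineFunctionProxy, and the returned PreservedAnalyses records that the pass merges memory instructions without ever touching block structure, so CFG analyses stay valid. Generalized into a self-contained sketch (hypothetical pass and helper names; the shape mirrors the code above):

  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachinePassManager.h"

  using namespace llvm;

  struct ExampleMIRPass : PassInfoMixin<ExampleMIRPass> {
    // Placeholder for an AA-guided rewrite; reports "no change" here.
    static bool transform(MachineFunction &, AAResults &) { return false; }

    PreservedAnalyses run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
      // Reach an IR-level (function) analysis from a machine function pass.
      auto &FAM =
          MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
              .getManager();
      AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
      if (!transform(MF, AA))
        return PreservedAnalyses::all();
      // Standard tail for passes that touch instructions but not the CFG.
      PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
      PA.preserveSet<CFGAnalyses>();
      return PA;
    }
  };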
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
new file mode 100644
index 00000000000000..fc1becb07d7276
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.h
@@ -0,0 +1,77 @@
+//===--- SILoadStoreOptimizer.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with nearby immediate offsets.
+// This will fuse operations such as
+//  ds_read_b32 v0, v2 offset:16
+//  ds_read_b32 v1, v2 offset:32
+// ==>
+//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+// The same is done for certain SMEM and VMEM opcodes, e.g.:
+//  s_buffer_load_dword s4, s[0:3], 4
+//  s_buffer_load_dword s5, s[0:3], 8
+// ==>
+//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
+//
+// This pass also tries to promote a constant offset to the immediate by
+// adjusting the base. It tries to use a base from nearby instructions that
+// allows a 13-bit constant offset, and then promotes that 13-bit offset to
+// the immediate.
+// E.g.
+//  s_movk_i32 s0, 0x1800
+//  v_add_co_u32_e32 v0, vcc, s0, v2
+//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+//  s_movk_i32 s0, 0x1000
+//  v_add_co_u32_e32 v5, vcc, s0, v2
+//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//  global_load_dwordx2 v[5:6], v[5:6], off
+//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
+//
+// Future improvements:
+//
+// - This is currently missing stores of constants because loading
+//   the constant into the data register is placed between the stores, although
+//   this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, and recomputes live intervals and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads have offsets that are too large to fit in the 8-bit
+//   offsets, but are close enough to fit in the 8 bits, we can add to the base
+//   pointer and use the new reduced offsets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class SILoadStoreOptimizerPass
+    : public PassInfoMixin<SILoadStoreOptimizerPass> {
+public:
+  SILoadStoreOptimizerPass() = default;
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SILOADSTOREOPTIMIZER_H
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
index f4cdedf9cf6eb8..9295bd59621039 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-dlc.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
 
 # The purpose of this test is to make sure we are combining relevant memory
 # operations correctly with/without DLC bit.
diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
index c4e131b90deb48..c0cc3e9f4edd7f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
+++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scc.mir
@@ -1,4 +1,5 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -passes=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s
 
 # The purpose of this test is to make sure we are combining relevant memory
 # operations correctly with/without SCC bit.
