[flang-commits] [libcxx] [llvm] [lld] [compiler-rt] [clang-tools-extra] [clang] [libc] [lldb] [flang] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

Wed Jan 17 10:23:04 PST 2024

https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 01/13] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDS DMA loads increase VMCNT, and a load from the LDS location they stored
to must wait on this counter so that it only reads memory after it is written.
The wait count insertion pass does not track memory dependencies; it
tracks register dependencies. To model the LDS dependency, a
pseudo register is used in the scoreboard, acting as if the LDS DMA
writes it and the LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +++++++++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        |   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll     | 154 ++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll    |   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                     MachineMemOperand::MOStore |
                     MachineMemOperand::MODereferenceable;
 
-      // XXX - Should this be volatile without known ordering?
-      Info.flags |= MachineMemOperand::MOVolatile;
-
       switch (IntrID) {
       default:
+        // XXX - Should this be volatile without known ordering?
+        Info.flags |= MachineMemOperand::MOVolatile;
         break;
       case Intrinsic::amdgcn_raw_buffer_load_lds:
       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
         unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
         Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+        Info.ptrVal = CI.getArgOperand(1);
         return true;
       }
       }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-                  MachineMemOperand::MOVolatile;
+    Info.ptrVal = CI.getArgOperand(1);
+    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
     MachinePointerInfo StorePtrI = LoadPtrI;
-    StorePtrI.V = nullptr;
+    LoadPtrI.V = UndefValue::get(
+        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
     auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     LoadPtrI.Offset = Op->getConstantOperandVal(5);
     MachinePointerInfo StorePtrI = LoadPtrI;
+    LoadPtrI.V = UndefValue::get(
+        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
     auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0,     // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
+  // Artificial register slots to track LDS writes into specific LDS locations
+  // if a location is known. When slots are exhausted or location is
+  // unknown use the first slot. The first slot is also always updated in
+  // addition to known location's slot to properly generate waits if dependent
+  // instruction's location is unknown.
+  EXTRA_VGPR_LDS = 0,
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
 };
 
@@ -292,6 +298,10 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
+  const SmallVectorImpl<const MachineInstr *>& getLDSDMAStores() const {
+    return LDSDMAStores;
+  }
+
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
@@ -354,6 +364,9 @@ class WaitcntBrackets {
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+  // Store representative LDS DMA operations. The only useful info here is
+  // alias info. One store is kept per unique AAInfo.
+  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -369,6 +382,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
   MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
+  AliasAnalysis *AA = nullptr;
 
   struct BlockInfo {
     std::unique_ptr<WaitcntBrackets> Incoming;
@@ -411,6 +425,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
     AU.setPreservesCFG();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -452,7 +468,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // FLAT instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
     assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
-    if (!ST->hasVscnt())
+    if (!ST->hasVscnt() || SIInstrInfo::isLDSDMA(Inst))
       return VMEM_ACCESS;
     if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
       // FLAT and SCRATCH instructions may access scratch. Other VMEM
@@ -547,8 +563,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
 // can be accessed. A load from LDS to VMEM does not need a wait.
 static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
-  return SIInstrInfo::isVALU(MI) &&
-         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
+  return SIInstrInfo::isLDSDMA(MI) &&
          MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
 }
 
@@ -704,7 +719,36 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       }
     }
     if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
-      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+      unsigned Slot = 0;
+      for (const auto *MemOp : Inst.memoperands()) {
+        if (MemOp->isStore() &&
+            MemOp->getAddrSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          // Comparing just AA info does not guarantee memoperands are equal
+          // in general, but this is so for LDS DMA on practice.
+          auto AAI = MemOp->getAAInfo();
+          if (!AAI)
+            break;
+          auto I = llvm::find_if(LDSDMAStores, [&AAI](const MachineInstr *I) {
+            for (const auto *MemOp : I->memoperands()) {
+              if (MemOp->isStore())
+                return AAI == MemOp->getAAInfo();
+            }
+            return false;
+          });
+          if (I != LDSDMAStores.end()) {
+            Slot = I - LDSDMAStores.begin() + 1;
+            break;
+          }
+          if (LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+            break;
+          LDSDMAStores.push_back(&Inst);
+          Slot = LDSDMAStores.size();
+          break;
+        }
+      }
+      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
+      if (Slot)
+        setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
     }
   }
 }
@@ -1180,9 +1224,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         // No need to wait before load from VMEM to LDS.
         if (mayWriteLDSThroughDMA(MI))
           continue;
-        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+
         // VM_CNT is only relevant to vgpr or LDS.
-        ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        bool FoundAliasingStore = false;
+        if (Ptr && Memop->getAAInfo()) {
+          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
+          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
+            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
+              FoundAliasingStore = true;
+              ScoreBrackets.determineWait(VM_CNT, RegNo + I + 1, Wait);
+            }
+          }
+        }
+        if (!FoundAliasingStore)
+          ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
         if (Memop->isStore()) {
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
         }
@@ -1818,6 +1874,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d6912544..17befd6c0e3462 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3654,8 +3654,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-    if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+    if (isDS(MIb) || isLDSDMA(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e388b5550cb104..456d8d835986fd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -547,6 +547,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::DS;
   }
 
+  static bool isLDSDMA(const MachineInstr &MI) {
+    return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+  }
+
+  bool isLDSDMA(uint16_t Opcode) {
+    return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+  }
+
   static bool isGWS(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::GWS;
   }
diff --git a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
new file mode 100644
index 00000000000000..31155290216d02
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX10
+
+ at lds.0 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.1 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.2 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.3 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.4 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.5 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.6 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.7 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.8 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.9 = internal addrspace(3) global [64 x float] poison, align 16
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
+; GCN-COUNT-4: buffer_load_dword
+; GCN: s_waitcnt vmcnt(2)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(0)
+; GCN: ds_read_b32
+define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0, i32 0, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp.0 = insertelement <2 x float> undef, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
+; waitcnt and the target can report early completion, then we need to force a waitcnt 0.
+
+; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
+; GCN-COUNT-4: global_load_dword
+; GFX9: s_waitcnt vmcnt(0)
+; GFX9-COUNT-2: ds_read_b32
+; GFX10: s_waitcnt vmcnt(2)
+; GFX10: ds_read_b32
+; GFX10: s_waitcnt vmcnt(0)
+; GFX10: ds_read_b32
+define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+main_body:
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp.0 = insertelement <2 x float> undef, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; There are 8 pseudo registers defined to track LDS DMA dependencies.
+; When exhausted we default to vmcnt(0).
+
+; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
+; GCN-COUNT-10: buffer_load_dword
+; GCN: s_waitcnt vmcnt(8)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(7)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(6)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(5)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(4)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(3)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(2)
+; GCN-NOT: s_waitcnt vmcnt
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(0)
+; GCN: ds_read_b32
+define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.2, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.4, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.5, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.6, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.7, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.8, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.9, i32 4, i32 0, i32 0, i32 0, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %gep.2 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i2
+  %gep.3 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+  %gep.4 = getelementptr float, ptr addrspace(3) @lds.4, i32 %i2
+  %gep.5 = getelementptr float, ptr addrspace(3) @lds.5, i32 %i2
+  %gep.6 = getelementptr float, ptr addrspace(3) @lds.6, i32 %i2
+  %gep.7 = getelementptr float, ptr addrspace(3) @lds.7, i32 %i2
+  %gep.8 = getelementptr float, ptr addrspace(3) @lds.8, i32 %i2
+  %gep.9 = getelementptr float, ptr addrspace(3) @lds.9, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.2 = load float, ptr addrspace(3) %gep.2, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.3 = load float, ptr addrspace(3) %gep.3, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.4 = load float, ptr addrspace(3) %gep.4, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.5 = load float, ptr addrspace(3) %gep.5, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.6 = load float, ptr addrspace(3) %gep.6, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.7 = load float, ptr addrspace(3) %gep.7, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.8 = load float, ptr addrspace(3) %gep.8, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.9 = load float, ptr addrspace(3) %gep.9, align 4
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
+  %out.gep.4 = getelementptr float, ptr addrspace(1) %out, i32 4
+  %out.gep.5 = getelementptr float, ptr addrspace(1) %out, i32 5
+  %out.gep.6 = getelementptr float, ptr addrspace(1) %out, i32 6
+  %out.gep.7 = getelementptr float, ptr addrspace(1) %out, i32 7
+  %out.gep.8 = getelementptr float, ptr addrspace(1) %out, i32 8
+  %out.gep.9 = getelementptr float, ptr addrspace(1) %out, i32 9
+  store float %val.0, ptr addrspace(1) %out
+  store float %val.1, ptr addrspace(1) %out.gep.1
+  store float %val.2, ptr addrspace(1) %out.gep.2
+  store float %val.3, ptr addrspace(1) %out.gep.3
+  store float %val.4, ptr addrspace(1) %out.gep.4
+  store float %val.5, ptr addrspace(1) %out.gep.5
+  store float %val.6, ptr addrspace(1) %out.gep.6
+  store float %val.7, ptr addrspace(1) %out.gep.7
+  store float %val.8, ptr addrspace(1) %out.gep.8
+  store float %val.9, ptr addrspace(1) %out.gep.9
+  ret void
+}
+
+declare void @llvm.amdgcn.wave.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 7abb789019f1f0..9787b8b6c6fbef 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -130,6 +130,8 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Machine Natural Loop Construction
 ; GCN-O0-NEXT:        MachinePostDominator Tree Construction
+; GCN-O0-NEXT:        Basic Alias Analysis (stateless AA impl)
+; GCN-O0-NEXT:        Function Alias Analysis Results
 ; GCN-O0-NEXT:        SI insert wait instructions
 ; GCN-O0-NEXT:        Insert required mode register values
 ; GCN-O0-NEXT:        SI Final Branch Preparation

>From bab128b4d4e5299084418805dd82f04e90448390 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 5 Dec 2023 15:31:12 -0800
Subject: [PATCH 02/13] clang-format

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 50ad22130e939e..0e0f9c33c60f1e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -298,7 +298,7 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
-  const SmallVectorImpl<const MachineInstr *>& getLDSDMAStores() const {
+  const SmallVectorImpl<const MachineInstr *> &getLDSDMAStores() const {
     return LDSDMAStores;
   }
 

>From cb78351759e3b1cc07ce8f5a973123881dbd52bd Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 10:18:24 -0800
Subject: [PATCH 03/13] Addressed review comments

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   | 4 ++--
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e079404b087fa..8fa17b3fa9ccd7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9084,7 +9084,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
     MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = UndefValue::get(
+    LoadPtrI.V = PoisonValue::get(
         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
@@ -9164,7 +9164,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     LoadPtrI.Offset = Op->getConstantOperandVal(5);
     MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = UndefValue::get(
+    LoadPtrI.V = PoisonValue::get(
         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0e0f9c33c60f1e..ff31b64ba0e190 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -298,7 +298,7 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
-  const SmallVectorImpl<const MachineInstr *> &getLDSDMAStores() const {
+  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
     return LDSDMAStores;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
index 31155290216d02..df8d7515c785e7 100644
--- a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
+++ b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX9
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixeses=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixeses=GCN,GFX10
 
 @lds.0 = internal addrspace(3) global [64 x float] poison, align 16
 @lds.1 = internal addrspace(3) global [64 x float] poison, align 16

>From 21c55d215fc70c71a71d542f583cd2d5dd36e4eb Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 11:00:53 -0800
Subject: [PATCH 04/13] Moved mayWriteLDSThroughDMA into SIInstrInfo

Changed getVmemWaitEventType() to use mayWriteLDSThroughDMA instead
of isLDSDMA as this is more sound and added a comment.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 18 ++++++++----------
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        |  4 ++++
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ff31b64ba0e190..bedfc21401d1cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -468,7 +468,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // FLAT instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
     assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
-    if (!ST->hasVscnt() || SIInstrInfo::isLDSDMA(Inst))
+    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+    // these should use VM_CNT.
+    if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
       return VMEM_ACCESS;
     if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
       // FLAT and SCRATCH instructions may access scratch. Other VMEM
@@ -560,13 +562,6 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
   }
 }
 
-// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
-// can be accessed. A load from LDS to VMEM does not need a wait.
-static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
-  return SIInstrInfo::isLDSDMA(MI) &&
-         MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
-}
-
 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
@@ -718,7 +713,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         setRegScore(RegNo, T, CurrScore);
       }
     }
-    if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
+    if (Inst.mayStore() &&
+        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
+      // written can be accessed. A load from LDS to VMEM does not need a wait.
       unsigned Slot = 0;
       for (const auto *MemOp : Inst.memoperands()) {
         if (MemOp->isStore() &&
@@ -1222,7 +1220,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
           continue;
         // No need to wait before load from VMEM to LDS.
-        if (mayWriteLDSThroughDMA(MI))
+        if (TII->mayWriteLDSThroughDMA(MI))
           continue;
 
         // VM_CNT is only relevant to vgpr or LDS.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 456d8d835986fd..6aaa46d51aab23 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -676,6 +676,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                   SIInstrFlags::IsAtomicNoRet);
   }
 
+  static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
+    return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
+  }
+
   static bool isWQM(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }

>From 0587c288295d13b5bbd29b455211ceb472d03159 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 14:43:59 -0800
Subject: [PATCH 05/13] Replaced iterator logic with a simple loop

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 39 ++++++++++-----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index bedfc21401d1cd..8b5cfa8f27999f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -719,30 +719,27 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       // written can be accessed. A load from LDS to VMEM does not need a wait.
       unsigned Slot = 0;
       for (const auto *MemOp : Inst.memoperands()) {
-        if (MemOp->isStore() &&
-            MemOp->getAddrSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-          // Comparing just AA info does not guarantee memoperands are equal
-          // in general, but this is so for LDS DMA on practice.
-          auto AAI = MemOp->getAAInfo();
-          if (!AAI)
-            break;
-          auto I = llvm::find_if(LDSDMAStores, [&AAI](const MachineInstr *I) {
-            for (const auto *MemOp : I->memoperands()) {
-              if (MemOp->isStore())
-                return AAI == MemOp->getAAInfo();
+        if (!MemOp->isStore() ||
+            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+          continue;
+        // Comparing just AA info does not guarantee memoperands are equal
+        // in general, but this is so for LDS DMA on practice.
+        auto AAI = MemOp->getAAInfo();
+        if (!AAI)
+          break;
+        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
+          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
+            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
+              Slot = I + 1;
+              break;
             }
-            return false;
-          });
-          if (I != LDSDMAStores.end()) {
-            Slot = I - LDSDMAStores.begin() + 1;
-            break;
           }
-          if (LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
-            break;
-          LDSDMAStores.push_back(&Inst);
-          Slot = LDSDMAStores.size();
-          break;
         }
+        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+          break;
+        LDSDMAStores.push_back(&Inst);
+        Slot = LDSDMAStores.size();
+        break;
       }
       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
       if (Slot)

>From ac7a0704af1155312902bb85fedeee812669250c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 7 Dec 2023 09:44:35 -0800
Subject: [PATCH 06/13] Fixed comment.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8b5cfa8f27999f..b2cb09e2222d48 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -723,7 +723,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
             MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         // Comparing just AA info does not guarantee memoperands are equal
-        // in general, but this is so for LDS DMA on practice.
+        // in general, but this is so for LDS DMA in practice.
         auto AAI = MemOp->getAAInfo();
         if (!AAI)
           break;

>From 7bab56ee7ae9f819078407b92c101ec744d27538 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 11 Dec 2023 08:14:19 -0800
Subject: [PATCH 07/13] Fix test placement and prefixes

---
 llvm/{lib/Target => test/CodeGen}/AMDGPU/lds-dma-waits.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename llvm/{lib/Target => test/CodeGen}/AMDGPU/lds-dma-waits.ll (99%)

diff --git a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
similarity index 99%
rename from llvm/lib/Target/AMDGPU/lds-dma-waits.ll
rename to llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
index df8d7515c785e7..ced66603171b96 100644
--- a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixeses=GCN,GFX9
-; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixeses=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
 
 @lds.0 = internal addrspace(3) global [64 x float] poison, align 16
 @lds.1 = internal addrspace(3) global [64 x float] poison, align 16

>From 0f780607299cd6bfc47b7341f69d30b476b5daa2 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 13 Dec 2023 11:41:01 -0800
Subject: [PATCH 08/13] Bail early in areMemAccessesTriviallyDisjoint

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 57eaefd41b2622..31669764144530 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3651,6 +3651,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
     return false;
 
+  if (isLDSDMA(MIa) || isLDSDMA(MIb))
+    return false;
+
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,

>From b86d65bd8e265dd41fb4d59fa223a9e42527683a Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 13 Dec 2023 11:43:43 -0800
Subject: [PATCH 09/13] Remove old code

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 31669764144530..d05d3c6996261f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3659,8 +3659,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa) || isLDSDMA(MIa)) {
-    if (isDS(MIb) || isLDSDMA(MIb))
+  if (isDS(MIa)) {
+    if (isDS(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);

>From 82ec4081f5142a656c0872306ba6568c31c7bf2e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 19 Dec 2023 11:58:07 -0800
Subject: [PATCH 10/13] Add check for presence of the alias scope info

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 2e1f939db74a60..0cfd332711fb34 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -731,7 +731,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         // Comparing just AA info does not guarantee memoperands are equal
         // in general, but this is so for LDS DMA in practice.
         auto AAI = MemOp->getAAInfo();
-        if (!AAI)
+        // Alias scope information gives a way to definitely identify an
+        // original memory object and practically produced in the module LDS
+        // lowering pass.
+        if (!AAI || !AAI.Scope)
           break;
         for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
           for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
@@ -1229,7 +1232,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         // VM_CNT is only relevant to vgpr or LDS.
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
         bool FoundAliasingStore = false;
-        if (Ptr && Memop->getAAInfo()) {
+        if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
             if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {

>From 4f71d9baed75e0bdd20ab13d10b08e416d7f4a25 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 15 Jan 2024 12:22:17 -0800
Subject: [PATCH 11/13] Added more comments.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0be7487077bece..9a90b115ef2e07 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -733,7 +733,11 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         auto AAI = MemOp->getAAInfo();
         // Alias scope information gives a way to definitely identify an
         // original memory object and practically produced in the module LDS
-        // lowering pass.
+        // lowering pass. If there is no scope available we will not be able
+        // to disambiguate LDS aliasing as after the module lowering all LDS
+        // is squashed into a single big object. Do not attemt to use one of
+        // the limited LDSDMAStores for something we will not be able to use
+        // anyway.
         if (!AAI || !AAI.Scope)
           break;
         for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
@@ -1232,6 +1236,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         // VM_CNT is only relevant to vgpr or LDS.
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
         bool FoundAliasingStore = false;
+        // Only objects with alias scope info were added to LDSDMAStores array.
+        // In the absence of the scope info we will not be able to disambiguate
+        // aliasing here. There is no need to try searching for a corresponding
+        // store slot. This is conservatively correct because in that case we
+        // will produce a wait using the first (general) LDS DMA wait slot which
+        // will wait on all of them anyway.
         if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {

>From 0115e30ed3e558cc1be6cded053497440721352a Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 17 Jan 2024 09:35:18 -0800
Subject: [PATCH 12/13] Fixed typo in the comment

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 44c528c9e73cc4..11f8a125aca614 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -735,7 +735,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         // original memory object and practically produced in the module LDS
         // lowering pass. If there is no scope available we will not be able
         // to disambiguate LDS aliasing as after the module lowering all LDS
-        // is squashed into a single big object. Do not attemt to use one of
+        // is squashed into a single big object. Do not attempt to use one of
         // the limited LDSDMAStores for something we will not be able to use
         // anyway.
         if (!AAI || !AAI.Scope)

>From 598804723c2602fca4eb1b14206d26d2735ad285 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 17 Jan 2024 10:22:26 -0800
Subject: [PATCH 13/13] Do not run AA at -O0

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 5 +++--
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll    | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 11f8a125aca614..62c977fc96a89d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -429,7 +429,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
     AU.setPreservesCFG();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
-    AU.addRequired<AAResultsWrapperPass>();
+    AU.addUsedIfAvailable<AAResultsWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -1901,7 +1901,8 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
-  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+    AA = &AAR->getAAResults();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index c59b84409634d4..8b0b6263832243 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -130,8 +130,6 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Machine Natural Loop Construction
 ; GCN-O0-NEXT:        MachinePostDominator Tree Construction
-; GCN-O0-NEXT:        Basic Alias Analysis (stateless AA impl)
-; GCN-O0-NEXT:        Function Alias Analysis Results
 ; GCN-O0-NEXT:        SI insert wait instructions
 ; GCN-O0-NEXT:        Insert required mode register values
 ; GCN-O0-NEXT:        SI Final Branch Preparation