[libcxx] [flang] [clang] [compiler-rt] [lld] [clang-tools-extra] [llvm] [lldb] [AMDGPU] Use alias info to relax waitcounts for LDS DMA (PR #74537)

Tue Dec 12 02:05:38 PST 2023

https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/74537

>From 7e382620cdc5999c645ed0746f242595f0294c58 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 4 Dec 2023 16:11:53 -0800
Subject: [PATCH 1/7] [AMDGPU] Use alias info to relax waitcounts for LDS DMA

LDA DMA loads increase VMCNT and a load from the LDS stored must
wait on this counter to only read memory after it is written.
Wait count insertion pass does not track memory dependencies, it
tracks register dependencies. To model the LDS dependency a
psuedo register is used in the scoreboard, acting like if LDS DMA
writes it and LDS load reads it.

This patch adds 8 more pseudo registers to use for independent LDS
locations if we can prove they are disjoint using alias analysis.

Fixes: SWDEV-433427
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |  16 +-
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp |  73 +++++++++-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp      |   4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        |   8 +
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll     | 154 ++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll    |   2 +
 6 files changed, 241 insertions(+), 16 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/lds-dma-waits.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7ef..2e079404b087fa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1128,11 +1128,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                     MachineMemOperand::MOStore |
                     MachineMemOperand::MODereferenceable;
 
-      // XXX - Should this be volatile without known ordering?
-      Info.flags |= MachineMemOperand::MOVolatile;
-
       switch (IntrID) {
       default:
+        // XXX - Should this be volatile without known ordering?
+        Info.flags |= MachineMemOperand::MOVolatile;
         break;
       case Intrinsic::amdgcn_raw_buffer_load_lds:
       case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1140,6 +1139,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
         unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
         Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+        Info.ptrVal = CI.getArgOperand(1);
         return true;
       }
       }
@@ -1268,8 +1268,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-                  MachineMemOperand::MOVolatile;
+    Info.ptrVal = CI.getArgOperand(1);
+    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9084,7 +9084,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
     MachinePointerInfo StorePtrI = LoadPtrI;
-    StorePtrI.V = nullptr;
+    LoadPtrI.V = UndefValue::get(
+        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
     auto F = LoadMMO->getFlags() &
@@ -9162,6 +9164,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     LoadPtrI.Offset = Op->getConstantOperandVal(5);
     MachinePointerInfo StorePtrI = LoadPtrI;
+    LoadPtrI.V = UndefValue::get(
+        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
     auto F = LoadMMO->getFlags() &
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7..50ad22130e939e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
@@ -121,8 +122,13 @@ enum RegisterMapping {
   SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
   AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
   SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
-  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
-  EXTRA_VGPR_LDS = 0,     // An artificial register to track LDS writes.
+  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
+  // Artificial register slots to track LDS writes into specific LDS locations
+  // if a location is known. When slots are exhausted or location is
+  // unknown use the first slot. The first slot is also always updated in
+  // addition to known location's slot to properly generate waits if dependent
+  // instruction's location is unknown.
+  EXTRA_VGPR_LDS = 0,
   NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
 };
 
@@ -292,6 +298,10 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
+  const SmallVectorImpl<const MachineInstr *>& getLDSDMAStores() const {
+    return LDSDMAStores;
+  }
+
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
@@ -354,6 +364,9 @@ class WaitcntBrackets {
   // Bitmask of the VmemTypes of VMEM instructions that might have a pending
   // write to each vgpr.
   unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+  // Store representative LDS DMA operations. The only useful info here is
+  // alias info. One store is kept per unique AAInfo.
+  SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
 };
 
 class SIInsertWaitcnts : public MachineFunctionPass {
@@ -369,6 +382,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
   MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
+  AliasAnalysis *AA = nullptr;
 
   struct BlockInfo {
     std::unique_ptr<WaitcntBrackets> Incoming;
@@ -411,6 +425,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
     AU.setPreservesCFG();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -452,7 +468,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // FLAT instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
     assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
-    if (!ST->hasVscnt())
+    if (!ST->hasVscnt() || SIInstrInfo::isLDSDMA(Inst))
       return VMEM_ACCESS;
     if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
       // FLAT and SCRATCH instructions may access scratch. Other VMEM
@@ -547,8 +563,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
 // can be accessed. A load from LDS to VMEM does not need a wait.
 static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
-  return SIInstrInfo::isVALU(MI) &&
-         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
+  return SIInstrInfo::isLDSDMA(MI) &&
          MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
 }
 
@@ -704,7 +719,36 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       }
     }
     if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
-      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+      unsigned Slot = 0;
+      for (const auto *MemOp : Inst.memoperands()) {
+        if (MemOp->isStore() &&
+            MemOp->getAddrSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          // Comparing just AA info does not guarantee memoperands are equal
+          // in general, but this is so for LDS DMA on practice.
+          auto AAI = MemOp->getAAInfo();
+          if (!AAI)
+            break;
+          auto I = llvm::find_if(LDSDMAStores, [&AAI](const MachineInstr *I) {
+            for (const auto *MemOp : I->memoperands()) {
+              if (MemOp->isStore())
+                return AAI == MemOp->getAAInfo();
+            }
+            return false;
+          });
+          if (I != LDSDMAStores.end()) {
+            Slot = I - LDSDMAStores.begin() + 1;
+            break;
+          }
+          if (LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+            break;
+          LDSDMAStores.push_back(&Inst);
+          Slot = LDSDMAStores.size();
+          break;
+        }
+      }
+      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
+      if (Slot)
+        setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
     }
   }
 }
@@ -1180,9 +1224,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         // No need to wait before load from VMEM to LDS.
         if (mayWriteLDSThroughDMA(MI))
           continue;
-        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+
         // VM_CNT is only relevant to vgpr or LDS.
-        ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        bool FoundAliasingStore = false;
+        if (Ptr && Memop->getAAInfo()) {
+          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
+          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
+            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
+              FoundAliasingStore = true;
+              ScoreBrackets.determineWait(VM_CNT, RegNo + I + 1, Wait);
+            }
+          }
+        }
+        if (!FoundAliasingStore)
+          ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
         if (Memop->isStore()) {
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
         }
@@ -1818,6 +1874,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b5b456d6912544..17befd6c0e3462 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3654,8 +3654,8 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(MIa)) {
-    if (isDS(MIb))
+  if (isDS(MIa) || isLDSDMA(MIa)) {
+    if (isDS(MIb) || isLDSDMA(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e388b5550cb104..456d8d835986fd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -547,6 +547,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return get(Opcode).TSFlags & SIInstrFlags::DS;
   }
 
+  static bool isLDSDMA(const MachineInstr &MI) {
+    return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+  }
+
+  bool isLDSDMA(uint16_t Opcode) {
+    return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+  }
+
   static bool isGWS(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::GWS;
   }
diff --git a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
new file mode 100644
index 00000000000000..31155290216d02
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
@@ -0,0 +1,154 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX10
+
+ at lds.0 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.1 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.2 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.3 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.4 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.5 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.6 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.7 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.8 = internal addrspace(3) global [64 x float] poison, align 16
+ at lds.9 = internal addrspace(3) global [64 x float] poison, align 16
+
+declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
+declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
+; GCN-COUNT-4: buffer_load_dword
+; GCN: s_waitcnt vmcnt(2)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(0)
+; GCN: ds_read_b32
+define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0, i32 0, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp.0 = insertelement <2 x float> undef, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
+; waitcnt and the target can report early completion, then we need to force a waitcnt 0.
+
+; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
+; GCN-COUNT-4: global_load_dword
+; GFX9: s_waitcnt vmcnt(0)
+; GFX9-COUNT-2: ds_read_b32
+; GFX10: s_waitcnt vmcnt(2)
+; GFX10: ds_read_b32
+; GFX10: s_waitcnt vmcnt(0)
+; GFX10: ds_read_b32
+define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
+main_body:
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
+  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  %tmp.0 = insertelement <2 x float> undef, float %val.0, i32 0
+  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+; There are 8 pseudo registers defined to track LDS DMA dependencies.
+; When exhausted we default to vmcnt(0).
+
+; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
+; GCN-COUNT-10: buffer_load_dword
+; GCN: s_waitcnt vmcnt(8)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(7)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(6)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(5)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(4)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(3)
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(2)
+; GCN-NOT: s_waitcnt vmcnt
+; GCN: ds_read_b32
+; GCN: s_waitcnt vmcnt(0)
+; GCN: ds_read_b32
+define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
+main_body:
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.2, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.4, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.5, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.6, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.7, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.8, i32 4, i32 0, i32 0, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.9, i32 4, i32 0, i32 0, i32 0, i32 0)
+  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
+  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
+  %gep.2 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i2
+  %gep.3 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
+  %gep.4 = getelementptr float, ptr addrspace(3) @lds.4, i32 %i2
+  %gep.5 = getelementptr float, ptr addrspace(3) @lds.5, i32 %i2
+  %gep.6 = getelementptr float, ptr addrspace(3) @lds.6, i32 %i2
+  %gep.7 = getelementptr float, ptr addrspace(3) @lds.7, i32 %i2
+  %gep.8 = getelementptr float, ptr addrspace(3) @lds.8, i32 %i2
+  %gep.9 = getelementptr float, ptr addrspace(3) @lds.9, i32 %i2
+  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.2 = load float, ptr addrspace(3) %gep.2, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.3 = load float, ptr addrspace(3) %gep.3, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.4 = load float, ptr addrspace(3) %gep.4, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.5 = load float, ptr addrspace(3) %gep.5, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.6 = load float, ptr addrspace(3) %gep.6, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.7 = load float, ptr addrspace(3) %gep.7, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.8 = load float, ptr addrspace(3) %gep.8, align 4
+  call void @llvm.amdgcn.wave.barrier()
+  %val.9 = load float, ptr addrspace(3) %gep.9, align 4
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
+  %out.gep.4 = getelementptr float, ptr addrspace(1) %out, i32 4
+  %out.gep.5 = getelementptr float, ptr addrspace(1) %out, i32 5
+  %out.gep.6 = getelementptr float, ptr addrspace(1) %out, i32 6
+  %out.gep.7 = getelementptr float, ptr addrspace(1) %out, i32 7
+  %out.gep.8 = getelementptr float, ptr addrspace(1) %out, i32 8
+  %out.gep.9 = getelementptr float, ptr addrspace(1) %out, i32 9
+  store float %val.0, ptr addrspace(1) %out
+  store float %val.1, ptr addrspace(1) %out.gep.1
+  store float %val.2, ptr addrspace(1) %out.gep.2
+  store float %val.3, ptr addrspace(1) %out.gep.3
+  store float %val.4, ptr addrspace(1) %out.gep.4
+  store float %val.5, ptr addrspace(1) %out.gep.5
+  store float %val.6, ptr addrspace(1) %out.gep.6
+  store float %val.7, ptr addrspace(1) %out.gep.7
+  store float %val.8, ptr addrspace(1) %out.gep.8
+  store float %val.9, ptr addrspace(1) %out.gep.9
+  ret void
+}
+
+declare void @llvm.amdgcn.wave.barrier()
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 7abb789019f1f0..9787b8b6c6fbef 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -130,6 +130,8 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Machine Natural Loop Construction
 ; GCN-O0-NEXT:        MachinePostDominator Tree Construction
+; GCN-O0-NEXT:        Basic Alias Analysis (stateless AA impl)
+; GCN-O0-NEXT:        Function Alias Analysis Results
 ; GCN-O0-NEXT:        SI insert wait instructions
 ; GCN-O0-NEXT:        Insert required mode register values
 ; GCN-O0-NEXT:        SI Final Branch Preparation

>From bab128b4d4e5299084418805dd82f04e90448390 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 5 Dec 2023 15:31:12 -0800
Subject: [PATCH 2/7] clang-format

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 50ad22130e939e..0e0f9c33c60f1e 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -298,7 +298,7 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
-  const SmallVectorImpl<const MachineInstr *>& getLDSDMAStores() const {
+  const SmallVectorImpl<const MachineInstr *> &getLDSDMAStores() const {
     return LDSDMAStores;
   }
 

>From cb78351759e3b1cc07ce8f5a973123881dbd52bd Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 10:18:24 -0800
Subject: [PATCH 3/7] Addressed review comments

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   | 4 ++--
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 llvm/lib/Target/AMDGPU/lds-dma-waits.ll     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2e079404b087fa..8fa17b3fa9ccd7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9084,7 +9084,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
     MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = UndefValue::get(
+    LoadPtrI.V = PoisonValue::get(
         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
@@ -9164,7 +9164,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     LoadPtrI.Offset = Op->getConstantOperandVal(5);
     MachinePointerInfo StorePtrI = LoadPtrI;
-    LoadPtrI.V = UndefValue::get(
+    LoadPtrI.V = PoisonValue::get(
         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0e0f9c33c60f1e..ff31b64ba0e190 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -298,7 +298,7 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
-  const SmallVectorImpl<const MachineInstr *> &getLDSDMAStores() const {
+  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
     return LDSDMAStores;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
index 31155290216d02..df8d7515c785e7 100644
--- a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
+++ b/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX9
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixeses=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixeses=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixeses=GCN,GFX10
 
 @lds.0 = internal addrspace(3) global [64 x float] poison, align 16
 @lds.1 = internal addrspace(3) global [64 x float] poison, align 16

>From 21c55d215fc70c71a71d542f583cd2d5dd36e4eb Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 11:00:53 -0800
Subject: [PATCH 4/7] Moved mayWriteLDSThroughDMA into SIInstrInfo

Changed getVmemWaitEventType() to use mayWriteLDSThroughDMA instead
of isLDSDMA as this is more sound and added a comment.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 18 ++++++++----------
 llvm/lib/Target/AMDGPU/SIInstrInfo.h        |  4 ++++
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ff31b64ba0e190..bedfc21401d1cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -468,7 +468,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   // FLAT instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
     assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
-    if (!ST->hasVscnt() || SIInstrInfo::isLDSDMA(Inst))
+    // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+    // these should use VM_CNT.
+    if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
       return VMEM_ACCESS;
     if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
       // FLAT and SCRATCH instructions may access scratch. Other VMEM
@@ -560,13 +562,6 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
   }
 }
 
-// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
-// can be accessed. A load from LDS to VMEM does not need a wait.
-static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
-  return SIInstrInfo::isLDSDMA(MI) &&
-         MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
-}
-
 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
@@ -718,7 +713,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         setRegScore(RegNo, T, CurrScore);
       }
     }
-    if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
+    if (Inst.mayStore() &&
+        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
+      // written can be accessed. A load from LDS to VMEM does not need a wait.
       unsigned Slot = 0;
       for (const auto *MemOp : Inst.memoperands()) {
         if (MemOp->isStore() &&
@@ -1222,7 +1220,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
           continue;
         // No need to wait before load from VMEM to LDS.
-        if (mayWriteLDSThroughDMA(MI))
+        if (TII->mayWriteLDSThroughDMA(MI))
           continue;
 
         // VM_CNT is only relevant to vgpr or LDS.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 456d8d835986fd..6aaa46d51aab23 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -676,6 +676,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                   SIInstrFlags::IsAtomicNoRet);
   }
 
+  static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
+    return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
+  }
+
   static bool isWQM(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::WQM;
   }

>From 0587c288295d13b5bbd29b455211ceb472d03159 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 14:43:59 -0800
Subject: [PATCH 5/7] Replaced iterator logic with a simple loop

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 39 ++++++++++-----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index bedfc21401d1cd..8b5cfa8f27999f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -719,30 +719,27 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
       // written can be accessed. A load from LDS to VMEM does not need a wait.
       unsigned Slot = 0;
       for (const auto *MemOp : Inst.memoperands()) {
-        if (MemOp->isStore() &&
-            MemOp->getAddrSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-          // Comparing just AA info does not guarantee memoperands are equal
-          // in general, but this is so for LDS DMA on practice.
-          auto AAI = MemOp->getAAInfo();
-          if (!AAI)
-            break;
-          auto I = llvm::find_if(LDSDMAStores, [&AAI](const MachineInstr *I) {
-            for (const auto *MemOp : I->memoperands()) {
-              if (MemOp->isStore())
-                return AAI == MemOp->getAAInfo();
+        if (!MemOp->isStore() ||
+            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+          continue;
+        // Comparing just AA info does not guarantee memoperands are equal
+        // in general, but this is so for LDS DMA on practice.
+        auto AAI = MemOp->getAAInfo();
+        if (!AAI)
+          break;
+        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
+          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
+            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
+              Slot = I + 1;
+              break;
             }
-            return false;
-          });
-          if (I != LDSDMAStores.end()) {
-            Slot = I - LDSDMAStores.begin() + 1;
-            break;
           }
-          if (LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
-            break;
-          LDSDMAStores.push_back(&Inst);
-          Slot = LDSDMAStores.size();
-          break;
         }
+        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+          break;
+        LDSDMAStores.push_back(&Inst);
+        Slot = LDSDMAStores.size();
+        break;
       }
       setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
       if (Slot)

>From ac7a0704af1155312902bb85fedeee812669250c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 7 Dec 2023 09:44:35 -0800
Subject: [PATCH 6/7] Fixed comment.

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8b5cfa8f27999f..b2cb09e2222d48 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -723,7 +723,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
             MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
           continue;
         // Comparing just AA info does not guarantee memoperands are equal
-        // in general, but this is so for LDS DMA on practice.
+        // in general, but this is so for LDS DMA in practice.
         auto AAI = MemOp->getAAInfo();
         if (!AAI)
           break;

>From 7bab56ee7ae9f819078407b92c101ec744d27538 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 11 Dec 2023 08:14:19 -0800
Subject: [PATCH 7/7] Fix test placement and prefixes

---
 llvm/{lib/Target => test/CodeGen}/AMDGPU/lds-dma-waits.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename llvm/{lib/Target => test/CodeGen}/AMDGPU/lds-dma-waits.ll (99%)

diff --git a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
similarity index 99%
rename from llvm/lib/Target/AMDGPU/lds-dma-waits.ll
rename to llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
index df8d7515c785e7..ced66603171b96 100644
--- a/llvm/lib/Target/AMDGPU/lds-dma-waits.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixeses=GCN,GFX9
-; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixeses=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
 
 @lds.0 = internal addrspace(3) global [64 x float] poison, align 16
 @lds.1 = internal addrspace(3) global [64 x float] poison, align 16