[llvm-branch-commits] [llvm] [AMDGPU] Add and plug-in DSLatencyMode flag + attr to control LDS latency (PR #205626)

Wed Jun 24 12:59:50 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

<details>
<summary>Changes</summary>

The latency of LDS instructions depends on runtime conditions which we cannot model accurately in the compiler.

This gives the user some control over the LDS latency. Mainly this is used to influence scheduling decisions (e.g. how many independent instructions to schedule between an LDS def and its use, HardwareUnit modelling in the coexec scheduler). That said, existing calls in our backend to `computeInstrLatency` are replaced with `getInstrLatency`, which has this new handling integrated.

Ideally, we would be able to replace target-independent calls of `computeInstrLatency` with this special handling -- that is the intention of https://github.com/llvm/llvm-project/pull/128925. For now, this just implements a shim and replaces the AMDGPU backend calls.

The actual latency values and granularity of control are still being worked out, so it is possible that we'll have a future PR to change the available values of `ds-latency-mode`. For now, this adds the machinery and reasonable options.

---

Patch is 33.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/205626.diff


14 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp (+2-3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp (+2-1) 
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+10-13) 
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+2-1) 
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.cpp (+9) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+59-2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+20) 
- (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/coexec-sched-flavor-classification.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll (+4-4) 
- (added) llvm/test/CodeGen/AMDGPU/ds-latency-mode-attr.mir (+61) 
- (added) llvm/test/CodeGen/AMDGPU/ds-latency-mode-branch-cost.mir (+53) 
- (added) llvm/test/CodeGen/AMDGPU/ds-latency-mode-default-scheduler.mir (+119) 
- (added) llvm/test/CodeGen/AMDGPU/ds-latency-mode-flag.mir (+29) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
index 18b4727a8605b..d5f6b73753b1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp
@@ -94,7 +94,6 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
   const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;
   SmallVector<SUnit *, 8> RegionTDM;
   SmallVector<SUnit *, 8> RegionAsync;
-  const TargetSchedModel *SchedModel = DAG->getSchedModel();
 
   for (SUnit &SU : DAG->SUnits) {
     const MachineInstr *MI = SU.getInstr();
@@ -117,9 +116,9 @@ void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
         if (!MI->mayLoad() || MI->mayStore())
           continue;
 
+        unsigned InstrLatency = TII->getInstrLatency(*MI);
         addLatencyToEdge(PredDep, SU,
-                         SchedModel ? SchedModel->computeInstrLatency(MI, false)
-                                    : FenceLatency);
+                         InstrLatency ? InstrLatency : FenceLatency);
       }
     } else if (Op == AMDGPU::S_BARRIER_WAIT) {
       for (SDep &PredDep : SU.Preds) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index d29ee5084cc6c..0daa1e1d88693 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -183,7 +183,7 @@ unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
 
   MachineInstr *MI = SU->getInstr();
   if (MI->mayLoadOrStore())
-    return SchedModel->computeInstrLatency(MI);
+    return SII->getInstrLatency(*MI);
 
   unsigned ReleaseAtCycle = 0;
   const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
@@ -714,6 +714,7 @@ ScheduleDAGInstrs *
 llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
   LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
                     << C->MF->getName() << '\n');
+
   ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
       C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
   DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 722ddb5d0c4dd..7dc285a11ad1f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2188,7 +2188,7 @@ static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI,
   bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
   unsigned Category = 0;
 
-  unsigned Latency = SchedModel.computeInstrLatency(&MI);
+  unsigned Latency = TII->getInstrLatency(MI);
   switch (Latency) {
   case 8:
     Category = IsSWMMAC ? 2 : 0;
@@ -2665,8 +2665,7 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
       Register DstReg = MI.getOperand(0).getReg();
       if (DstReg == Reg)
         return false;
-      HazardDefLatency =
-          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
+      HazardDefLatency = std::max(HazardDefLatency, TII.getInstrLatency(MI));
       return TRI.regsOverlap(DstReg, Reg);
     };
 
@@ -2742,8 +2741,7 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
       if (!SIInstrInfo::isMFMA(MI))
         return false;
       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
-      HazardDefLatency =
-          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
+      HazardDefLatency = std::max(HazardDefLatency, TII.getInstrLatency(MI));
       return TRI.regsOverlap(Reg, DstReg);
     };
 
@@ -2904,8 +2902,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
-        else if (ST.hasGFX940Insts() &&
-                 TSchedModel.computeInstrLatency(MI1) == 2)
+        else if (ST.hasGFX940Insts() && TII.getInstrLatency(*MI1) == 2)
           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
       } else {
         switch (Opc1) {
@@ -2925,7 +2922,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
           break;
         default:
-          int NumPasses = TSchedModel.computeInstrLatency(MI1);
+          int NumPasses = TII.getInstrLatency(*MI1);
           if (ST.hasGFX940Insts()) {
             if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
               break;
@@ -2982,7 +2979,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
         break;
       default:
-        int NumPasses = TSchedModel.computeInstrLatency(MI1);
+        int NumPasses = TII.getInstrLatency(*MI1);
 
         if (ST.hasGFX940Insts()) {
           NeedWaitStates =
@@ -3262,7 +3259,7 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
       if (!MFMA)
         continue;
 
-      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+      unsigned HazardDefLatency = TII.getInstrLatency(*MFMA);
       int NumPasses = HazardDefLatency;
       int NeedWaitStates = MaxWaitStates;
 
@@ -3357,7 +3354,7 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
     if (MFMA) {
       int NeedWaitStates = MaxWaitStates;
-      int NumPasses = TSchedModel.computeInstrLatency(MFMA);
+      int NumPasses = TII.getInstrLatency(*MFMA);
 
       if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
         switch (NumPasses) {
@@ -3424,7 +3421,7 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
     if (!MFMA)
       continue;
 
-    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
+    unsigned HazardDefLatency = TII.getInstrLatency(*MFMA);
     int NeedWaitStates = MaxWaitStates;
     switch (HazardDefLatency) {
     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
@@ -3463,7 +3460,7 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
   if (IsMFMAFn(*MI)) {
     int W = getWaitStatesSince(IsMFMAFn, 16);
     if (MAI)
-      return W < (int)TSchedModel.computeInstrLatency(MAI);
+      return W < (int)TII.getInstrLatency(*MAI);
   }
 
   return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index a4f854beaeebe..a59f6c643fca7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1999,10 +1999,11 @@ GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                       DenseMap<unsigned, unsigned> &ReadyCycles,
                                       const TargetSchedModel &SM) {
   unsigned ReadyCycle = CurrCycle;
+  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
   for (auto &D : SU.Preds) {
     if (D.isAssignedRegDep()) {
       MachineInstr *DefMI = D.getSUnit()->getInstr();
-      unsigned Latency = SM.computeInstrLatency(DefMI);
+      unsigned Latency = SII->getInstrLatency(*DefMI);
       unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
       ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
     }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 37efb3a51cb9d..c2267780a84fc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -786,6 +786,15 @@ void GCNSubtarget::adjustSchedDependency(
     return; // This is not a data dependency anymore.
   }
 
+  // DS load/store latency is variable depending on LDS contention.
+  if (InstrInfo.isDS(*DefI) &&
+      InstrInfo.getDSLatencyMultiplier(*DefI->getMF()) != 1) {
+    // For LDS instructions, we have overrides to change default latencies.
+    unsigned Latency = InstrInfo.getInstrLatency(*DefI);
+    Dep.setLatency(Latency);
+    return;
+  }
+
   if (DefI->isBundle()) {
     const SIRegisterInfo *TRI = getRegisterInfo();
     auto Reg = Dep.getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 43cdaa34cf3e3..c4e0c0d467541 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,6 +15,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPULaneMaskUtils.h"
+#include "AMDGPUTargetMachine.h"
 #include "GCNHazardRecognizer.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
@@ -63,6 +64,17 @@ static cl::opt<bool> Fix16BitCopies(
   cl::init(true),
   cl::ReallyHidden);
 
+static cl::opt<SIInstrInfo::DSLatencyMode> DSLatency(
+    "amdgpu-ds-latency-mode", cl::desc("LDS latency mode (LDS contention)"),
+    cl::values(
+        clEnumValN(SIInstrInfo::DSLatencyMode::Fast, "fast",
+                   "Use default/pinned latency (no contention)"),
+        clEnumValN(SIInstrInfo::DSLatencyMode::Loaded, "loaded",
+                   "Use loaded latency (moderate contention, 3x latency)"),
+        clEnumValN(SIInstrInfo::DSLatencyMode::Overloaded, "overloaded",
+                   "Use overloaded latency (high contention, 5x latency)")),
+    cl::init(SIInstrInfo::DSLatencyMode::Fast), cl::Hidden);
+
 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
     : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP,
                          AMDGPU::ADJCALLSTACKDOWN),
@@ -10834,12 +10846,24 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
     unsigned Lat = 0, Count = 0;
     for (++I; I != E && I->isBundledWithPred(); ++I) {
       ++Count;
-      Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
+      Lat = std::max(Lat, getInstrLatency(*I));
     }
     return Lat + Count - 1;
   }
 
-  return SchedModel.computeInstrLatency(&MI);
+  return getInstrLatency(MI);
+}
+
+unsigned SIInstrInfo::getInstrLatency(const MachineInstr &MI) const {
+  if (SchedModel.hasInstrSchedModel()) {
+    unsigned Latency = SchedModel.computeInstrLatency(&MI);
+    if (isDS(MI)) {
+      Latency *= getDSLatencyMultiplier(*MI.getMF());
+    }
+    return Latency;
+  }
+
+  return 0;
 }
 
 const MachineOperand &
@@ -11528,3 +11552,36 @@ bool SIInstrInfo::isXDL(const MachineInstr &MI) const {
 
   return AMDGPU::getMAIIsGFX940XDL(Opcode);
 }
+
+unsigned SIInstrInfo::getDSLatencyMultiplier(const MachineFunction &MF) {
+  const Function &F = MF.getFunction();
+
+  // Priority selection goes to the attribute
+  Attribute A = F.getFnAttribute("amdgpu-ds-latency-mode");
+  if (A.isValid()) {
+    StringRef Val = A.getValueAsString();
+    if (Val == "fast")
+      return 1;
+    if (Val == "loaded")
+      return 3;
+    if (Val == "overloaded")
+      return 5;
+  }
+
+  // If using coexec scheduler, default to "loaded" mode unless overridden
+  // by the command line option.
+  if (DSLatency.getNumOccurrences() == 0 &&
+      AMDGPU::getSchedStrategy(F) == "coexec")
+    return 3;
+
+  switch (DSLatency) {
+  case DSLatencyMode::Fast:
+    return 1; // Use default scheduling model latency
+  case DSLatencyMode::Loaded:
+    return 3;
+  case DSLatencyMode::Overloaded:
+    return 5;
+  }
+
+  return 1;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1d67c8664ff44..2359a7622b106 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1197,6 +1197,21 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
     return Opcode == AMDGPU::SCHED_GROUP_BARRIER || Opcode == AMDGPU::IGLP_OPT;
   }
 
+  /// DS latency modes. The latency of DS load/store instructions
+  /// is variable depending on LDS contention.
+  enum class DSLatencyMode {
+    Fast,      ///< Use default/pinned latency (no contention)
+    Loaded,    ///< Use loaded latency (moderate contention, 3x latency)
+    Overloaded ///< Use overloaded latency (high contention, 5x latency)
+  };
+
+  /// \p returns the DS instruction latency multiplier based on the selected
+  /// DSLatencyMode. \p returns 1 if the default
+  /// scheduling model latency should be used (fast mode).
+  /// Checks the function attribute first, then if using coexec scheduler
+  /// defaults to "loaded", then falls back to the global command line option.
+  static unsigned getDSLatencyMultiplier(const MachineFunction &MF);
+
   static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
     switch (Opcode) {
     case AMDGPU::S_WAITCNT_soft:
@@ -1738,10 +1753,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
                                       LiveIntervals *LIS = nullptr,
                                       VirtRegMap *VRM = nullptr) const override;
 
+  // Silence a hidden overloaded virtual function warning.
+  using TargetInstrInfo::getInstrLatency;
+
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
                            const MachineInstr &MI,
                            unsigned *PredCost = nullptr) const override;
 
+  unsigned getInstrLatency(const MachineInstr &MI) const;
+
   const MachineOperand &getCalleeOperand(const MachineInstr &MI) const override;
 
   ValueUniformity getValueUniformity(const MachineInstr &MI) const final;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index a496c9a4daa71..19097118df3f0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -421,14 +421,14 @@ class BranchWeightCostModel {
     BranchProb = Head.getSuccProbability(FromIt);
     if (BranchProb.isUnknown())
       BranchProb = BranchProbability::getZero();
-    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
+    BranchTakenCost = TII.getInstrLatency(Branch);
   }
 
   bool isProfitable(const MachineInstr &MI) {
     if (TII.isWaitcnt(MI.getOpcode()))
       return false;
 
-    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
+    ThenCyclesCost += TII.getInstrLatency(MI);
 
     // Consider `P = N/D` to be the probability of execz being false (skipping
     // the then-block) The transformation is profitable if always executing the
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-flavor-classification.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-flavor-classification.mir
index 82dd1d8748675..497558cab6e42 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-flavor-classification.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-flavor-classification.mir
@@ -5,7 +5,7 @@
 # CHECK-DAG: VALU(1c): 4 cycles, 4 instrs
 # CHECK-DAG: TRANS: 2 cycles, 2 instrs
 # CHECK-DAG: VMEM: 3200 cycles, 10 instrs
-# CHECK-DAG: DS: 80 cycles, 4 instrs
+# CHECK-DAG: DS: 240 cycles, 4 instrs
 # CHECK-DAG: DMA: 640 cycles, 2 instrs
 
 --- |
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
index b225d8f18805a..61bfd9f0db77e 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -311,8 +311,6 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[112:115], v124 offset:640
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[116:119], v124 offset:704
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[120:123], v124 offset:896
-; COEXEC-NEXT:    s_wait_dscnt 0x13
-; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[124:127], v124 offset:960
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[128:131], v156 offset:128
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[132:135], v156 offset:192
@@ -320,10 +318,12 @@ define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[140:143], v156 offset:448
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[144:147], v156 offset:640
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[148:151], v156 offset:704
-; COEXEC-NEXT:    s_wait_dscnt 0x16
-; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[152:155], v156 offset:896
 ; COEXEC-NEXT:    ds_load_tr16_b128 v[156:159], v156 offset:960
+; COEXEC-NEXT:    s_wait_dscnt 0x1c
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; COEXEC-NEXT:    s_wait_dscnt 0x18
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
 ; COEXEC-NEXT:    s_wait_dscnt 0x14
 ; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
 ; COEXEC-NEXT:    s_wait_dscnt 0x10
diff --git a/llvm/test/CodeGen/AMDGPU/ds-latency-mode-attr.mir b/llvm/test/CodeGen/AMDGPU/ds-latency-mode-attr.mir
new file mode 100644
index 0000000000000..59bc0c0554c9c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds-latency-mode-attr.mir
@@ -0,0 +1,61 @@
+# REQUIRES: asserts
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -debug-only=machine-scheduler %s -filetype=null 2>&1 | FileCheck %s
+
+
+# CHECK: Region: test_ds_latency_fast
+# CHECK: DS: 80 cycles, 4 instrs
+
+# CHECK: Region: test_ds_latency_loaded
+# CHECK: DS: 240 cycles, 4 instrs
+
+# CHECK: Region: test_ds_latency_overloaded
+# CHECK: DS: 400 cycles, 4 instrs
+
+--- |
+  define void @test_ds_latency_fast() "amdgpu-waves-per-eu"="1,1" "amdgpu-ds-latency-mode"="fast" { ret void }
+  define void @test_ds_latency_loaded() "amdgpu-waves-per-eu"="1,1" "amdgpu-ds-latency-mode"="loaded" { ret void }
+  define void @test_ds_latency_overloaded() "amdgpu-waves-per-eu"="1,1" "amdgpu-ds-latency-mode"="overloaded" { ret void }
+
+...
+
+---
+name: test_ds_latency_fast
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_B32 %0, 0, 0, implicit $m0, implicit $exec
+    %3:vreg_64_align2 = DS_READ_B64 %0, 0, 0, implicit $m0, implicit $exec
+    DS_WRITE_B32 %0, %2, 0, 0, implicit $m0, implicit $exec
+    DS_WRITE_B64 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM 0, implicit %2, implicit %3
+...
+
+---
+name: test_ds_latency_loaded
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_B32 %0, 0, 0, implicit $m0, implicit $exec
+    %3:vreg_64_align2 = DS_READ_B64 %0, 0, 0, implicit $m0, implicit $exec
+    DS_WRITE_B32 %0, %2, 0, 0, implicit $m0, implicit $exec
+    DS_WRITE_B64 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM 0, implicit %2, implicit %3
+...
+
+---
+name: test_ds_latency_overloaded
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = DS_READ_B32 %0, 0, 0, implicit $m0, implicit $exec
+    %3:vreg_64_align2 = DS_READ_B64 %0, 0, 0, implicit $m0, implicit $exec
+    DS_WRITE_B32 %0, %2, 0, 0, implicit $m0, implicit $exec
+    DS_WRITE_B64 %0, %1, 0, 0, implicit $m0, implicit $exec
+    S_ENDPGM 0, implicit %2, implicit %3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/ds-latency-mode-branch-cost.mir b/llvm/test/CodeGen/AMDGPU/ds-latency-mode-branch-cost.mir
new file mode 100644
index 0000000000000..95bf9df77965a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds-latency-mode-branch-cost.mir
@@ -0,0 +1,53 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-pre-emit-peephole -amdgpu-ds-latency-mode=fast %s -o - | FileCheck -check-prefix=FAST %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass si-pre-emit-peephole -amdgpu-ds-latency-mode=overloaded %s -o - | FileCheck -check-prefix=OVERLOADED %s
+
+# Test that ds-latency-mode affects the branch-over-removal cost model in
+# SIPreEmitPeephole. With higher DS latency, the cost of the then-block
+# is higher, making branch removal less profitable.
+
+---
+name: skip_execz_ds_multi
+body: |
+  ; FAST-LABEL: name: skip_execz_ds_multi
+  ; FAST: bb.0:
+  ; FAST-NEXT:   successors: %bb.1(0x78000000)
+  ; FAST-NEXT: {{  $}}
+  ; FAST-NEXT: bb.1:
+  ; FAST-NEXT:   successors: %bb.2(0x80000000)
+  ; FAST-NEXT: {{  $}}
+  ; FAST-NEXT:   $vgpr0 = V_...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/205626