[llvm] [AMDGPU] Account for existing SDWA selections (PR #123221)

Thu Jan 16 09:03:40 PST 2025

https://github.com/frederik-h created https://github.com/llvm/llvm-project/pull/123221

None

>From b29c0f218db0170f0848741a89b408bca25156c1 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 10 Jan 2025 09:59:00 -0800
Subject: [PATCH 01/10] [AMDGPU] Account for existing SDWA selections

    Change-Id: I3e1cf6042f069e8dffe9dd5b4654288111f7b1bf
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 135 ++++++++++++++++++++--
 1 file changed, 123 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 467f042892cebe..f515ba1aac5d06 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -85,6 +85,8 @@ class SIPeepholeSDWALegacy : public MachineFunctionPass {
   }
 };
 
+using namespace AMDGPU::SDWA;
+
 class SDWAOperand {
 private:
   MachineOperand *Target; // Operand that would be used in converted instruction
@@ -102,12 +104,55 @@ class SDWAOperand {
   virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                            const GCNSubtarget &ST,
                                            SDWAOperandsMap *PotentialMatches = nullptr) = 0;
-  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
+                             bool Combine = false) = 0;
 
   MachineOperand *getTargetOperand() const { return Target; }
   MachineOperand *getReplacedOperand() const { return Replaced; }
   MachineInstr *getParentInst() const { return Target->getParent(); }
 
+  /// Fold a \p FoldedOp SDWA selection into an \p ExistingOp existing SDWA
+  /// selection. If the selections are compatible, \p return true and store the
+  /// SDWA selection in
+  /// \p NewOp .
+  /// For example, if we have existing BYTE_0 Sel and are attempting to fold
+  /// WORD_1 Sel: BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
+  bool combineSdwaSel(SdwaSel ExistingOp, SdwaSel FoldedOp, SdwaSel &NewOp) {
+    if (ExistingOp == SdwaSel::DWORD) {
+      NewOp = FoldedOp;
+      return true;
+    }
+    if (FoldedOp == SdwaSel::DWORD) {
+      NewOp = ExistingOp;
+      return true;
+    }
+
+    if (FoldedOp != SdwaSel::WORD_0 && FoldedOp != SdwaSel::WORD_1 &&
+        FoldedOp != ExistingOp)
+      return false;
+
+    if (ExistingOp == SdwaSel::WORD_1 || ExistingOp == SdwaSel::BYTE_2 ||
+        ExistingOp == SdwaSel::BYTE_3)
+      return false;
+
+    if (ExistingOp == FoldedOp) {
+      NewOp = ExistingOp;
+      return true;
+    }
+
+    if (FoldedOp == SdwaSel::WORD_0) {
+      NewOp = ExistingOp;
+      return true;
+    }
+
+    if (FoldedOp == SdwaSel::WORD_1) {
+      NewOp = (SdwaSel)((unsigned)ExistingOp + 2);
+      return true;
+    }
+
+    return false;
+  }
+
   MachineRegisterInfo *getMRI() const {
     return &getParentInst()->getParent()->getParent()->getRegInfo();
   }
@@ -118,8 +163,6 @@ class SDWAOperand {
 #endif
 };
 
-using namespace AMDGPU::SDWA;
-
 class SDWASrcOperand : public SDWAOperand {
 private:
   SdwaSel SrcSel;
@@ -137,7 +180,8 @@ class SDWASrcOperand : public SDWAOperand {
   MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                    const GCNSubtarget &ST,
                                    SDWAOperandsMap *PotentialMatches = nullptr) override;
-  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
+                     bool Combine = false) override;
 
   SdwaSel getSrcSel() const { return SrcSel; }
   bool getAbs() const { return Abs; }
@@ -166,7 +210,8 @@ class SDWADstOperand : public SDWAOperand {
   MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                    const GCNSubtarget &ST,
                                    SDWAOperandsMap *PotentialMatches = nullptr) override;
-  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
+                     bool Combine = false) override;
 
   SdwaSel getDstSel() const { return DstSel; }
   DstUnused getDstUnused() const { return DstUn; }
@@ -186,7 +231,8 @@ class SDWADstPreserveOperand : public SDWADstOperand {
       : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
         Preserve(PreserveOp) {}
 
-  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
+                     bool Combine = false) override;
 
   MachineOperand *getPreservedOperand() const { return Preserve; }
 
@@ -375,7 +421,8 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
   return PotentialMO->getParent();
 }
 
-bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
+                                   bool Combine) {
   switch (MI.getOpcode()) {
   case AMDGPU::V_CVT_F32_FP8_sdwa:
   case AMDGPU::V_CVT_F32_BF8_sdwa:
@@ -451,7 +498,16 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   }
   copyRegOperand(*Src, *getTargetOperand());
   if (!IsPreserveSrc) {
-    SrcSel->setImm(getSrcSel());
+    if (Combine) {
+      SdwaSel NewOp;
+      bool CanCombine =
+          combineSdwaSel((SdwaSel)SrcSel->getImm(), getSrcSel(), NewOp);
+      if (!CanCombine)
+        return false;
+      SrcSel->setImm(NewOp);
+    } else {
+      SrcSel->setImm(getSrcSel());
+    }
     SrcMods->setImm(getSrcMods(TII, Src));
   }
   getTargetOperand()->setIsKill(false);
@@ -479,7 +535,8 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
   return PotentialMO->getParent();
 }
 
-bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
+                                   bool Combine) {
   // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
 
   if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
@@ -498,7 +555,16 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   copyRegOperand(*Operand, *getTargetOperand());
   MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
   assert(DstSel);
-  DstSel->setImm(getDstSel());
+  if (Combine) {
+    SdwaSel NewOp;
+    bool CanCombine =
+        combineSdwaSel((SdwaSel)DstSel->getImm(), getDstSel(), NewOp);
+    if (!CanCombine)
+      return false;
+    DstSel->setImm(NewOp);
+  } else {
+    DstSel->setImm(getDstSel());
+  }
   MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
   assert(DstUnused);
   DstUnused->setImm(getDstUnused());
@@ -510,7 +576,8 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
 }
 
 bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
-                                           const SIInstrInfo *TII) {
+                                           const SIInstrInfo *TII,
+                                           bool Combine) {
   // MI should be moved right before v_or_b32.
   // For this we should clear all kill flags on uses of MI src-operands or else
   // we can encounter problem with use of killed operand.
@@ -535,7 +602,7 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                  MI.getNumOperands() - 1);
 
   // Convert MI as any other SDWADstOperand and remove v_or_b32
-  return SDWADstOperand::convertToSDWA(MI, TII);
+  return SDWADstOperand::convertToSDWA(MI, TII, Combine);
 }
 
 std::optional<int64_t>
@@ -1029,6 +1096,50 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
   // Convert to sdwa
   int SDWAOpcode;
   unsigned Opcode = MI.getOpcode();
+
+  // If the MI is already SDWA, preserve any existing opsel
+  if (TII->isSDWA(Opcode)) {
+    auto SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
+    MI.getParent()->insert(MI.getIterator(), SDWAInst);
+
+    // Apply all sdwa operand patterns.
+    bool Converted = false;
+    for (auto &Operand : SDWAOperands) {
+      LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
+      // There should be no intersection between SDWA operands and potential MIs
+      // e.g.:
+      // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+      // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+      // v_add_u32 v3, v4, v2
+      //
+      // In that example it is possible that we would fold 2nd instruction into
+      // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd
+      // (that was already destroyed). So if SDWAOperand is also a potential MI
+      // then do not apply it.
+      if (PotentialMatches.count(Operand->getParentInst()) == 0)
+        Converted |= Operand->convertToSDWA(*SDWAInst, TII, true);
+    }
+
+    if (Converted) {
+      ConvertedInstructions.push_back(SDWAInst);
+      for (MachineOperand &MO : SDWAInst->uses()) {
+        if (!MO.isReg())
+          continue;
+
+        MRI->clearKillFlags(MO.getReg());
+      }
+    } else {
+      SDWAInst->eraseFromParent();
+      return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    ++NumSDWAInstructionsPeepholed;
+
+    MI.eraseFromParent();
+    return true;
+  }
+
   if (TII->isSDWA(Opcode)) {
     SDWAOpcode = Opcode;
   } else {

>From 8d16c1cdde49f8f1e5073693c9820404d7afbc29 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Tue, 14 Jan 2025 11:20:53 -0500
Subject: [PATCH 02/10] [AMDGPU] Correct transformation and simplify
 combineSdwaSel

- Remove redundant "if".
- Replace arithmetic on the SdwaSel type with an explicit case
  distinction.

The case distinction is clearer and fixes a mishandled case:
since (SdwaSel)((unsigned)WORD_0 + 2) == DWORD,
the existing code led to the transformation:
    WORD_0 Sel (WORD_1 Sel (%X)) -> DWORD Sel (%X)
The correct transformation is:
    WORD_0 Sel (WORD_1 Sel (%X)) -> WORD_1 Sel (%X)
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index f515ba1aac5d06..4cbc6de30b4f19 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -122,15 +122,12 @@ class SDWAOperand {
       NewOp = FoldedOp;
       return true;
     }
+
     if (FoldedOp == SdwaSel::DWORD) {
       NewOp = ExistingOp;
       return true;
     }
 
-    if (FoldedOp != SdwaSel::WORD_0 && FoldedOp != SdwaSel::WORD_1 &&
-        FoldedOp != ExistingOp)
-      return false;
-
     if (ExistingOp == SdwaSel::WORD_1 || ExistingOp == SdwaSel::BYTE_2 ||
         ExistingOp == SdwaSel::BYTE_3)
       return false;
@@ -146,9 +143,15 @@ class SDWAOperand {
     }
 
     if (FoldedOp == SdwaSel::WORD_1) {
-      NewOp = (SdwaSel)((unsigned)ExistingOp + 2);
+      if (ExistingOp == SdwaSel::BYTE_0)
+        NewOp = SdwaSel::BYTE_2;
+      else if (ExistingOp == SdwaSel::BYTE_1)
+        NewOp = SdwaSel::BYTE_3;
+      else if (ExistingOp == SdwaSel::WORD_0)
+        NewOp = SdwaSel::WORD_1;
+
       return true;
-    }
+    }    
 
     return false;
   }

>From 20e23b697e789e07f642c2f3be297f5107d32eed Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 04:17:32 -0500
Subject: [PATCH 03/10] [AMDGPU] Change formatting of combineSdwaSel

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 4cbc6de30b4f19..8b9c7b9607dfd3 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -116,7 +116,8 @@ class SDWAOperand {
   /// SDWA selection in
   /// \p NewOp .
   /// For example, if we have existing BYTE_0 Sel and are attempting to fold
-  /// WORD_1 Sel: BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
+  /// WORD_1 Sel:
+  ///     BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
   bool combineSdwaSel(SdwaSel ExistingOp, SdwaSel FoldedOp, SdwaSel &NewOp) {
     if (ExistingOp == SdwaSel::DWORD) {
       NewOp = FoldedOp;
@@ -151,7 +152,7 @@ class SDWAOperand {
         NewOp = SdwaSel::WORD_1;
 
       return true;
-    }    
+    }
 
     return false;
   }

>From 663b94c8fceb7554be7935e168a4e660f6f82e44 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 07:30:08 -0500
Subject: [PATCH 04/10] [AMDGPU] Remove dead branch from
 SIPeepholeSDWA::convertToSDWA

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 8b9c7b9607dfd3..14c5cb730f3ee5 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1098,7 +1098,6 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
   LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
 
   // Convert to sdwa
-  int SDWAOpcode;
   unsigned Opcode = MI.getOpcode();
 
   // If the MI is already SDWA, preserve any existing opsel
@@ -1144,13 +1143,10 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     return true;
   }
 
-  if (TII->isSDWA(Opcode)) {
-    SDWAOpcode = Opcode;
-  } else {
-    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
-    if (SDWAOpcode == -1)
-      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
-  }
+  assert(!TII->isSDWA(Opcode));
+  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
+  if (SDWAOpcode == -1)
+    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
   assert(SDWAOpcode != -1);
 
   const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

>From c2dfca063d1e06beadbd9205461b4f46c74c7dfe Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 07:58:03 -0500
Subject: [PATCH 05/10] [AMDGPU] Extract SDWA instruction creation from
 convertToSDWA

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 108 ++++++++++++----------
 1 file changed, 58 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 14c5cb730f3ee5..37aea94a3c5a83 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -62,6 +62,7 @@ class SIPeepholeSDWA {
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
   void pseudoOpConvertToVOP2(MachineInstr &MI,
                              const GCNSubtarget &ST) const;
+  MachineInstr *createSDWAVersion(MachineInstr &MI);
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
 
@@ -1092,58 +1093,10 @@ bool isConvertibleToSDWA(MachineInstr &MI,
 }
 } // namespace
 
-bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
-                                   const SDWAOperandsVector &SDWAOperands) {
-
-  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
-
-  // Convert to sdwa
+MachineInstr* SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
   unsigned Opcode = MI.getOpcode();
-
-  // If the MI is already SDWA, preserve any existing opsel
-  if (TII->isSDWA(Opcode)) {
-    auto SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
-    MI.getParent()->insert(MI.getIterator(), SDWAInst);
-
-    // Apply all sdwa operand patterns.
-    bool Converted = false;
-    for (auto &Operand : SDWAOperands) {
-      LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
-      // There should be no intersection between SDWA operands and potential MIs
-      // e.g.:
-      // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
-      // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
-      // v_add_u32 v3, v4, v2
-      //
-      // In that example it is possible that we would fold 2nd instruction into
-      // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd
-      // (that was already destroyed). So if SDWAOperand is also a potential MI
-      // then do not apply it.
-      if (PotentialMatches.count(Operand->getParentInst()) == 0)
-        Converted |= Operand->convertToSDWA(*SDWAInst, TII, true);
-    }
-
-    if (Converted) {
-      ConvertedInstructions.push_back(SDWAInst);
-      for (MachineOperand &MO : SDWAInst->uses()) {
-        if (!MO.isReg())
-          continue;
-
-        MRI->clearKillFlags(MO.getReg());
-      }
-    } else {
-      SDWAInst->eraseFromParent();
-      return false;
-    }
-
-    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
-    ++NumSDWAInstructionsPeepholed;
-
-    MI.eraseFromParent();
-    return true;
-  }
-
   assert(!TII->isSDWA(Opcode));
+
   int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
   if (SDWAOpcode == -1)
     SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
@@ -1280,6 +1233,61 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
   }
 
+  return SDWAInst.getInstr();
+}
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
+  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
+
+  // Convert to sdwa
+  unsigned Opcode = MI.getOpcode();
+
+  // If the MI is already SDWA, preserve any existing opsel
+  if (TII->isSDWA(Opcode)) {
+    auto SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
+    MI.getParent()->insert(MI.getIterator(), SDWAInst);
+
+    // Apply all sdwa operand patterns.
+    bool Converted = false;
+    for (auto &Operand : SDWAOperands) {
+      LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
+      // There should be no intersection between SDWA operands and potential MIs
+      // e.g.:
+      // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+      // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+      // v_add_u32 v3, v4, v2
+      //
+      // In that example it is possible that we would fold 2nd instruction into
+      // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd
+      // (that was already destroyed). So if SDWAOperand is also a potential MI
+      // then do not apply it.
+      if (PotentialMatches.count(Operand->getParentInst()) == 0)
+        Converted |= Operand->convertToSDWA(*SDWAInst, TII, true);
+    }
+
+    if (Converted) {
+      ConvertedInstructions.push_back(SDWAInst);
+      for (MachineOperand &MO : SDWAInst->uses()) {
+        if (!MO.isReg())
+          continue;
+
+        MRI->clearKillFlags(MO.getReg());
+      }
+    } else {
+      SDWAInst->eraseFromParent();
+      return false;
+    }
+
+    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
+    ++NumSDWAInstructionsPeepholed;
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  MachineInstr *SDWAInst{createSDWAVersion(MI)};
+
   // Apply all sdwa operand patterns.
   bool Converted = false;
   for (auto &Operand : SDWAOperands) {

>From 38bd038049bd6cf6f69161903af0340ee9297ad9 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 08:18:56 -0500
Subject: [PATCH 06/10] [AMDGPU] Unify loops in SIPeepholeSDWA::convertToSDWA

There are two loops that invoke the conversion on the operands
of the input instruction, one for the case where the instruction
is already an SDWA instruction and one for the case where it isn't.
The loops differ only in whether they ask the operands'
convertToSDWA to combine selections.

Fuse those loops into a single loop.
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 59 +++++------------------
 1 file changed, 13 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 37aea94a3c5a83..bbbff2083745da 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1240,54 +1240,21 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                    const SDWAOperandsVector &SDWAOperands) {
   LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
 
-  // Convert to sdwa
-  unsigned Opcode = MI.getOpcode();
-
-  // If the MI is already SDWA, preserve any existing opsel
-  if (TII->isSDWA(Opcode)) {
-    auto SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
+  MachineInstr *SDWAInst;
+  bool CombineSelections;
+  if (TII->isSDWA(MI.getOpcode())) {
+    // No conversion necessary, since MI is an SDWA instruction.  But
+    // tell convertToSDWA below to combine selections of this instruction
+    // and its SDWA operands.
+    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
     MI.getParent()->insert(MI.getIterator(), SDWAInst);
-
-    // Apply all sdwa operand patterns.
-    bool Converted = false;
-    for (auto &Operand : SDWAOperands) {
-      LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
-      // There should be no intersection between SDWA operands and potential MIs
-      // e.g.:
-      // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
-      // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
-      // v_add_u32 v3, v4, v2
-      //
-      // In that example it is possible that we would fold 2nd instruction into
-      // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd
-      // (that was already destroyed). So if SDWAOperand is also a potential MI
-      // then do not apply it.
-      if (PotentialMatches.count(Operand->getParentInst()) == 0)
-        Converted |= Operand->convertToSDWA(*SDWAInst, TII, true);
-    }
-
-    if (Converted) {
-      ConvertedInstructions.push_back(SDWAInst);
-      for (MachineOperand &MO : SDWAInst->uses()) {
-        if (!MO.isReg())
-          continue;
-
-        MRI->clearKillFlags(MO.getReg());
-      }
-    } else {
-      SDWAInst->eraseFromParent();
-      return false;
-    }
-
-    LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
-    ++NumSDWAInstructionsPeepholed;
-
-    MI.eraseFromParent();
-    return true;
+    CombineSelections = true;
+  } else {
+    // Convert to sdwa
+    SDWAInst = createSDWAVersion(MI);
+    CombineSelections = false;
   }
 
-  MachineInstr *SDWAInst{createSDWAVersion(MI)};
-
   // Apply all sdwa operand patterns.
   bool Converted = false;
   for (auto &Operand : SDWAOperands) {
@@ -1303,7 +1270,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     // was already destroyed). So if SDWAOperand is also a potential MI then do
     // not apply it.
     if (PotentialMatches.count(Operand->getParentInst()) == 0)
-      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+      Converted |= Operand->convertToSDWA(*SDWAInst, TII, CombineSelections);
   }
 
   if (Converted) {

>From e5923ac0328d1f62729778cffd5c4e70d72ac758 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 08:28:45 -0500
Subject: [PATCH 07/10] [AMDGPU] Invert if statement in
 SIPeepholeSDWA::convertToSDWA

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bbbff2083745da..944d85f72bf6bb 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -1273,19 +1273,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
       Converted |= Operand->convertToSDWA(*SDWAInst, TII, CombineSelections);
   }
 
-  if (Converted) {
-    ConvertedInstructions.push_back(SDWAInst);
-    for (MachineOperand &MO : SDWAInst->uses()) {
-      if (!MO.isReg())
-        continue;
-
-      MRI->clearKillFlags(MO.getReg());
-    }
-  } else {
+  if (!Converted) {
     SDWAInst->eraseFromParent();
     return false;
   }
 
+  ConvertedInstructions.push_back(SDWAInst);
+  for (MachineOperand &MO : SDWAInst->uses()) {
+    if (!MO.isReg())
+      continue;
+
+    MRI->clearKillFlags(MO.getReg());
+  }
   LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
   ++NumSDWAInstructionsPeepholed;
 

>From 7034d2dc78a7884e52462ac5fd6d338d3817a72e Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 08:39:37 -0500
Subject: [PATCH 08/10] [AMDGPU] Rename "Combine" to "CombineSelections" in
 SIPeepholeSDWA

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 944d85f72bf6bb..f018be15155f72 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -106,7 +106,7 @@ class SDWAOperand {
                                            const GCNSubtarget &ST,
                                            SDWAOperandsMap *PotentialMatches = nullptr) = 0;
   virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
-                             bool Combine = false) = 0;
+                             bool CombineSelections = false) = 0;
 
   MachineOperand *getTargetOperand() const { return Target; }
   MachineOperand *getReplacedOperand() const { return Replaced; }
@@ -186,7 +186,7 @@ class SDWASrcOperand : public SDWAOperand {
                                    const GCNSubtarget &ST,
                                    SDWAOperandsMap *PotentialMatches = nullptr) override;
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
-                     bool Combine = false) override;
+                     bool CombineSelections = false) override;
 
   SdwaSel getSrcSel() const { return SrcSel; }
   bool getAbs() const { return Abs; }
@@ -216,7 +216,7 @@ class SDWADstOperand : public SDWAOperand {
                                    const GCNSubtarget &ST,
                                    SDWAOperandsMap *PotentialMatches = nullptr) override;
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
-                     bool Combine = false) override;
+                     bool CombineSelections = false) override;
 
   SdwaSel getDstSel() const { return DstSel; }
   DstUnused getDstUnused() const { return DstUn; }
@@ -237,7 +237,7 @@ class SDWADstPreserveOperand : public SDWADstOperand {
         Preserve(PreserveOp) {}
 
   bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
-                     bool Combine = false) override;
+                     bool CombineSelections = false) override;
 
   MachineOperand *getPreservedOperand() const { return Preserve; }
 
@@ -427,7 +427,7 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
 }
 
 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
-                                   bool Combine) {
+                                   bool CombineSelections) {
   switch (MI.getOpcode()) {
   case AMDGPU::V_CVT_F32_FP8_sdwa:
   case AMDGPU::V_CVT_F32_BF8_sdwa:
@@ -503,7 +503,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
   }
   copyRegOperand(*Src, *getTargetOperand());
   if (!IsPreserveSrc) {
-    if (Combine) {
+    if (CombineSelections) {
       SdwaSel NewOp;
       bool CanCombine =
           combineSdwaSel((SdwaSel)SrcSel->getImm(), getSrcSel(), NewOp);
@@ -541,7 +541,7 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
 }
 
 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
-                                   bool Combine) {
+                                   bool CombineSelections) {
   // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
 
   if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
@@ -560,7 +560,7 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
   copyRegOperand(*Operand, *getTargetOperand());
   MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
   assert(DstSel);
-  if (Combine) {
+  if (CombineSelections) {
     SdwaSel NewOp;
     bool CanCombine =
         combineSdwaSel((SdwaSel)DstSel->getImm(), getDstSel(), NewOp);
@@ -582,7 +582,7 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
 
 bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                            const SIInstrInfo *TII,
-                                           bool Combine) {
+                                           bool CombineSelections) {
   // MI should be moved right before v_or_b32.
   // For this we should clear all kill flags on uses of MI src-operands or else
   // we can encounter problem with use of killed operand.
@@ -607,7 +607,7 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                  MI.getNumOperands() - 1);
 
   // Convert MI as any other SDWADstOperand and remove v_or_b32
-  return SDWADstOperand::convertToSDWA(MI, TII, Combine);
+  return SDWADstOperand::convertToSDWA(MI, TII, CombineSelections);
 }
 
 std::optional<int64_t>

>From bbe9ab85865e2ff2c91c6e402d219bd9d533af38 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 09:19:43 -0500
Subject: [PATCH 09/10] [AMDGPU] Change combineSdwaSel to use optional return
 type

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 68 +++++++++--------------
 1 file changed, 27 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index f018be15155f72..ae8c614ddb3fda 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -113,49 +113,37 @@ class SDWAOperand {
   MachineInstr *getParentInst() const { return Target->getParent(); }
 
   /// Fold a \p FoldedOp SDWA selection into an \p ExistingOp existing SDWA
-  /// selection. If the selections are compatible, \p return true and store the
-  /// SDWA selection in
-  /// \p NewOp .
-  /// For example, if we have existing BYTE_0 Sel and are attempting to fold
-  /// WORD_1 Sel:
+  /// selection. If the selections are compatible, return the combined
+  /// selection, otherwise return a nullopt. For example, if we have existing
+  /// BYTE_0 Sel and are attempting to fold WORD_1 Sel:
   ///     BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
-  bool combineSdwaSel(SdwaSel ExistingOp, SdwaSel FoldedOp, SdwaSel &NewOp) {
-    if (ExistingOp == SdwaSel::DWORD) {
-      NewOp = FoldedOp;
-      return true;
-    }
+  std::optional<SdwaSel> combineSdwaSel(SdwaSel ExistingOp, SdwaSel FoldedOp) {
+    if (ExistingOp == SdwaSel::DWORD)
+      return FoldedOp;
 
-    if (FoldedOp == SdwaSel::DWORD) {
-      NewOp = ExistingOp;
-      return true;
-    }
+    if (FoldedOp == SdwaSel::DWORD)
+      return ExistingOp;
 
     if (ExistingOp == SdwaSel::WORD_1 || ExistingOp == SdwaSel::BYTE_2 ||
         ExistingOp == SdwaSel::BYTE_3)
-      return false;
+      return {};
 
-    if (ExistingOp == FoldedOp) {
-      NewOp = ExistingOp;
-      return true;
-    }
+    if (ExistingOp == FoldedOp)
+      return ExistingOp;
 
-    if (FoldedOp == SdwaSel::WORD_0) {
-      NewOp = ExistingOp;
-      return true;
-    }
+    if (FoldedOp == SdwaSel::WORD_0)
+      return ExistingOp;
 
     if (FoldedOp == SdwaSel::WORD_1) {
       if (ExistingOp == SdwaSel::BYTE_0)
-        NewOp = SdwaSel::BYTE_2;
-      else if (ExistingOp == SdwaSel::BYTE_1)
-        NewOp = SdwaSel::BYTE_3;
-      else if (ExistingOp == SdwaSel::WORD_0)
-        NewOp = SdwaSel::WORD_1;
-
-      return true;
+        return SdwaSel::BYTE_2;
+      if (ExistingOp == SdwaSel::BYTE_1)
+        return SdwaSel::BYTE_3;
+      if (ExistingOp == SdwaSel::WORD_0)
+        return SdwaSel::WORD_1;
     }
 
-    return false;
+    return {};
   }
 
   MachineRegisterInfo *getMRI() const {
@@ -504,12 +492,11 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
   copyRegOperand(*Src, *getTargetOperand());
   if (!IsPreserveSrc) {
     if (CombineSelections) {
-      SdwaSel NewOp;
-      bool CanCombine =
-          combineSdwaSel((SdwaSel)SrcSel->getImm(), getSrcSel(), NewOp);
-      if (!CanCombine)
+      std::optional<SdwaSel> NewOp =
+          combineSdwaSel((SdwaSel)SrcSel->getImm(), getSrcSel());
+      if (!NewOp.has_value())
         return false;
-      SrcSel->setImm(NewOp);
+      SrcSel->setImm(NewOp.value());
     } else {
       SrcSel->setImm(getSrcSel());
     }
@@ -561,12 +548,11 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII,
   MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
   assert(DstSel);
   if (CombineSelections) {
-    SdwaSel NewOp;
-    bool CanCombine =
-        combineSdwaSel((SdwaSel)DstSel->getImm(), getDstSel(), NewOp);
-    if (!CanCombine)
+    std::optional<SdwaSel> NewOp =
+     combineSdwaSel((SdwaSel)DstSel->getImm(), getDstSel());
+    if (!NewOp.has_value())
       return false;
-    DstSel->setImm(NewOp);
+    DstSel->setImm(NewOp.value());
   } else {
     DstSel->setImm(getDstSel());
   }

>From 245c93bce25df06a000d7cc2e8d97ae828b55791 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 16 Jan 2025 11:15:12 -0500
Subject: [PATCH 10/10] [AMDGPU] Add regression test for invalid SDWA selection
 handling

---
 .../sdwa-peephole-instr-combine-sel.mir       | 124 ++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir

diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
new file mode 100644
index 00000000000000..43708e9513c68b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
@@ -0,0 +1,124 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=NOHAZARD %s
+
+---
+name:            sdwa_opsel_hazard
+body:             |
+  ; NOHAZARD-LABEL: name: sdwa_opsel_hazard
+  ; NOHAZARD: bb.0:
+  ; NOHAZARD-NEXT:   successors: %bb.7(0x40000000), %bb.8(0x40000000)
+  ; NOHAZARD-NEXT:   liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; NOHAZARD-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+  ; NOHAZARD-NEXT:   [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; NOHAZARD-NEXT:   [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[DEF1]], [[DEF2]], 0, 0, implicit $exec
+  ; NOHAZARD-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF undef [[DEF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; NOHAZARD-NEXT:   S_BRANCH %bb.7
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.1:
+  ; NOHAZARD-NEXT:   successors: %bb.2(0x80000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec
+  ; NOHAZARD-NEXT:   [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, [[V_MOV_B32_e32_]], 0, 6, 0, 5, 6, implicit $exec
+  ; NOHAZARD-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+  ; NOHAZARD-NEXT:   [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_1]], 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 2, implicit $exec
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.2:
+  ; NOHAZARD-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed undef %9, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; NOHAZARD-NEXT:   S_BRANCH %bb.3
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.3:
+  ; NOHAZARD-NEXT:   successors: %bb.4(0x80000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.4:
+  ; NOHAZARD-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed undef [[SI_IF1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; NOHAZARD-NEXT:   S_BRANCH %bb.5
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.5:
+  ; NOHAZARD-NEXT:   successors: %bb.6(0x80000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.6:
+  ; NOHAZARD-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT:   [[SI_IF3:%[0-9]+]]:sreg_32 = SI_IF undef [[DEF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; NOHAZARD-NEXT:   S_BRANCH %bb.9
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.7:
+  ; NOHAZARD-NEXT:   successors: %bb.8(0x80000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.8:
+  ; NOHAZARD-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+  ; NOHAZARD-NEXT:   [[SI_IF4:%[0-9]+]]:sreg_32 = SI_IF killed undef [[SI_IF]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; NOHAZARD-NEXT:   S_BRANCH %bb.1
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.9:
+  ; NOHAZARD-NEXT:   successors: %bb.10(0x80000000)
+  ; NOHAZARD-NEXT: {{  $}}
+  ; NOHAZARD-NEXT: bb.10:
+  ; NOHAZARD-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.7(0x40000000), %bb.8(0x40000000)
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6
+
+    %0:sreg_32 = IMPLICIT_DEF
+    %1:sreg_64_xexec_xnull = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %1, %2, 0, 0, implicit $exec
+    %4:sreg_32 = SI_IF undef %0, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.7
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %5:vgpr_32 = V_AND_B32_e64 undef %6, 255, implicit $exec
+    %7:vgpr_32 = V_LSHLREV_B32_e64 2, killed undef %5, implicit $exec
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.4(0x40000000)
+
+    %8:sreg_32 = SI_IF killed undef %9, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+
+  bb.4:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+    %10:sreg_32 = SI_IF killed undef %8, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+
+  bb.6:
+    successors: %bb.9(0x40000000), %bb.10(0x40000000)
+
+    %11:sreg_32 = SI_IF undef %0, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.9
+
+  bb.7:
+    successors: %bb.8(0x80000000)
+
+  bb.8:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+    %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec
+    %9:sreg_32 = SI_IF killed undef %4, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.9:
+    successors: %bb.10(0x80000000)
+
+  bb.10:
+    S_ENDPGM 0
+
+...
+