[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 15 00:14:51 PDT 2024


llvmbot wrote:


@llvm/pr-subscribers-llvm-globalisel

Author: Carl Ritson (perlfu)

Changes:

Optimize V_SET_INACTIVE by allowing it to run in WWM, so WWM sections
are no longer broken up for inactive-lane setting.
In WWM, V_SET_INACTIVE can typically be lowered to V_CNDMASK.
Some cases still require exec manipulation and V_MOVs, as in the
previous lowering.
GFX9 sees a slight instruction count increase in edge cases due to its
smaller constant bus limit.
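
A minimal sketch of the lowering difference (register assignments are
illustrative, not taken from the patch): previously, the inactive lanes
had to be written under an inverted exec mask; inside a WWM section the
saved exec mask can instead drive a single V_CNDMASK that selects
between the two sources.

```
; Before: two V_MOVs with exec flipped around the second
v_mov_b32 v0, v1               ; active-lane value
s_not_b64 exec, exec
v_mov_b32 v0, v2               ; inactive-lane value
s_not_b64 exec, exec

; After, inside WWM with the original exec mask saved in s[4:5]:
; lanes set in s[4:5] (originally active) take v1, the rest take v2
v_cndmask_b32_e64 v0, v2, v1, s[4:5]
```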

Additionally, avoid introducing exec manipulation and V_MOVs where a
source of V_SET_INACTIVE is also its destination.
This is a common pattern, as WWM register pre-allocation often
assigns the same register.
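
A sketch of that same-register case (hypothetical registers): when the
active source already lives in the destination, the active-lane copy is
a no-op and can be dropped, leaving only the inactive-lane write.

```
; V_SET_INACTIVE_B32 v0, v0, v1 outside WWM: the active source is
; already in v0, so only the inactive lanes need to be written
s_not_b64 exec, exec
v_mov_b32 v0, v1
s_not_b64 exec, exec
```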

---

Patch is 245.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/98864.diff


17 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+140-29) 
- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+57-1) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+56-72) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+25-41) 
- (modified) llvm/test/CodeGen/AMDGPU/cse-convergent.ll (+2-12) 
- (modified) llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll (+6-12) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+95-205) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+90-156) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+90-156) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+95-205) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (+110-77) 
- (modified) llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll (+2-10) 
- (modified) llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll (+1-4) 
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+12-36) 
- (modified) llvm/test/CodeGen/AMDGPU/wqm.mir (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+110-102) 
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+220-242) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cc1b9ac0c9ecd..d551a7887e706 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,37 +2273,148 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-      .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+
+    // Find implicit exec src if this is running in WWM.
+    Register ExecSrcReg = 0;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // present an issue.
+    // Fall back to V_MOV-based lowering in all but the common cases.
+    bool InWWM = !!ExecSrcReg;
+    bool UseVCndMask = false;
+    if (InWWM) {
+      const MachineFunction *MF = MI.getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF->getRegInfo();
+      const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+      const MCInstrDesc &Desc = get(Opcode);
+      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+      int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+      int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+      int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+      int ConstantBusUses = 1; // Starts at one for ExecSrcReg
+      int LiteralConstants = 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0;
+      ConstantBusUses +=
+          usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0;
+      LiteralConstants +=
+          ActiveSrc.isImm() &&
+                  !isInlineConstant(ActiveSrc, Desc.operands()[Src1Idx])
+              ? 1
+              : 0;
+      LiteralConstants +=
+          InactiveSrc.isImm() &&
+                  !isInlineConstant(InactiveSrc, Desc.operands()[Src0Idx])
+              ? 1
+              : 0;
+      UseVCndMask = ConstantBusUses <= ConstantBusLimit &&
+                    LiteralConstants <= LiteralLimit &&
+                    (!VMov64 || (ActiveSrc.isReg() && InactiveSrc.isReg()));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // WWM B64; decompose to two B32 operations.
+      // Test above ensures that both sources are registers.
+      // Note: this is done to avoid falling back to V_MOV multiple times
+      // and introducing exec manipulation for each VGPR separately.
+      assert(ActiveSrc.isReg() && InactiveSrc.isReg());
+      Register ActiveLo = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0);
+      Register ActiveHi = RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1);
+      Register InactiveLo = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0);
+      Register InactiveHi = RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1);
+      MachineInstr *Tmp;
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub0))
+                .addReg(InactiveLo)
+                .addReg(ActiveLo)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+      Tmp = BuildMI(MBB, MI, DL, get(AMDGPU::V_SET_INACTIVE_B32),
+                    RI.getSubReg(DstReg, AMDGPU::sub1))
+                .addReg(InactiveHi, InactiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ActiveHi, ActiveSrc.isKill() ? RegState::Kill : 0)
+                .addReg(ExecSrcReg, RegState::Implicit)
+                .addReg(DstReg, RegState::ImplicitDefine);
+      expandPostRAPseudo(*Tmp);
+    } else if (UseVCndMask) {
+      // WWM B32; use V_CNDMASK.
+      MachineInstr *VCndMask =
+          BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+              .addImm(0)
+              .add(InactiveSrc)
+              .addImm(0)
+              .add(ActiveSrc)
+              .addReg(ExecSrcReg);
+      // Copy implicit defs in case this is part of V_SET_INACTIVE_B64.
+      for (auto &Op : MI.implicit_operands()) {
+        if (!Op.isDef())
+          continue;
+        VCndMask->addOperand(Op);
+      }
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a src is the destination.
+      // This can happen if WWM register allocation was efficient.
+      bool SkipActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool SkipInactive = InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!SkipActive) {
+        if (InWWM) {
+          // Cancel WWM
+          BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+        }
+        // Copy active lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+                .add(ActiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+      }
+      if (!SkipInactive) {
+        // Set exec mask to inactive lanes
+        MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+                                   .addReg(InWWM ? ExecSrcReg : ExecReg);
+        ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+        // Copy inactive lanes
+        MachineInstr *VMov =
+            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+        if (VMov64)
+          expandPostRAPseudo(*VMov);
+        if (!InWWM) {
+          // Restore original exec mask
+          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecReg);
+        }
+      }
+      if (InWWM) {
+        // Restore WWM
+        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+      }
+    }
     MI.eraseFromParent();
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 19e761ef45b25..e01c045e7ef3d 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -178,6 +178,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
   SmallVector<MachineInstr *, 4> KillInstrs;
   SmallVector<MachineInstr *, 4> InitExecInstrs;
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 
   void printInfo();
 
@@ -225,6 +226,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
   void lowerInitExec(MachineInstr &MI);
   MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry);
 
+  void harmonizeTransitions();
+
 public:
   static char ID;
 
@@ -477,7 +480,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
-  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -554,6 +556,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         GlobalFlags |= StateStrictWQM;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+        // Disable strict states here while marking; relax them later.
         III.Disabled = StateStrict;
         MachineOperand &Inactive = MI.getOperand(2);
         if (Inactive.isReg()) {
@@ -564,6 +567,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
           }
         }
         SetInactiveInstrs.push_back(&MI);
+        GlobalFlags |= StateStrictWWM;
+        BBI.NeedsLowering = true;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -1037,6 +1042,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
 
   SmallVector<MachineInstr *, 4> SplitPoints;
+  Register ActiveLanesReg = 0;
   char State = BI.InitialState;
 
   for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1053,6 +1059,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
       SplitPoint = lowerKillF32(MBB, MI);
       break;
+    case AMDGPU::ENTER_STRICT_WWM:
+      ActiveLanesReg = MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::EXIT_STRICT_WWM:
+      ActiveLanesReg = 0;
+      break;
+    case AMDGPU::V_SET_INACTIVE_B32:
+    case AMDGPU::V_SET_INACTIVE_B64:
+      if (ActiveLanesReg) {
+        MI.addOperand(*MBB.getParent(),
+                      MachineOperand::CreateReg(ActiveLanesReg, false, true));
+      } else
+        assert(State == StateExact || State == StateWQM);
+      break;
     default:
       break;
     }
@@ -1617,6 +1637,40 @@ SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry) {
   return InsertPt;
 }
 
+void SIWholeQuadMode::harmonizeTransitions() {
+  // Relax requirements on SET_INACTIVE to allow it in WWM regions.
+  for (MachineInstr *MI : SetInactiveInstrs) {
+    if (MI->getOpcode() == AMDGPU::COPY)
+      continue;
+
+    Instructions[MI].Disabled &= ~StateStrictWWM;
+
+    auto MBB = MI->getParent();
+    auto It = MI->getIterator();
+    if (It == MBB->end())
+      continue;
+
+    bool AddWWM = false;
+    auto NextMI = std::next(It);
+    if (NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+        NextMI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+      // Groups of SET_INACTIVE are more efficient in WWM.
+      AddWWM = true;
+    } else {
+      // Back propagate WWM needs of next instruction.
+      auto III = Instructions.find(&*NextMI);
+      AddWWM =
+          (III != Instructions.end() && III->second.Needs & StateStrictWWM);
+    }
+
+    if (!AddWWM)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "merge into WWM: " << *MI);
+    Instructions[MI].Needs |= StateStrictWWM;
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                     << " ------------- \n");
@@ -1629,6 +1683,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LowerToMovInstrs.clear();
   KillInstrs.clear();
   InitExecInstrs.clear();
+  SetInactiveInstrs.clear();
   StateTransition.clear();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1701,6 +1756,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
     LIS->InsertMachineInstrInMaps(*MI);
     lowerKillInstrs(true);
   } else {
+    harmonizeTransitions();
     for (auto BII : Blocks)
       processBlock(*BII.first, BII.first == &Entry);
     // Lowering blocks causes block splitting so perform as a second pass.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 0c60be9d94591..8fb4f2cd79a70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -96,15 +96,14 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
 ; GCN-LABEL: set_inactive_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -117,17 +116,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 ; GCN-LABEL: set_inactive_f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT:    s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 0xcccccccd
+; GCN-NEXT:    s_mov_b32 s7, 0x4010cccc
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -140,15 +137,14 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
 define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
 ; GCN-LABEL: set_inactive_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -160,15 +156,14 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
 define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; GCN-LABEL: set_inactive_v2f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -181,17 +176,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
 ; GCN-LABEL: set_inactive_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 1
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -205,17 +198,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 ; GCN-LABEL: set_inactive_v2f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s6, 1.0
+; GCN-NEXT:    s_mov_b32 s7, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -228,15 +219,14 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
 define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
 ; GCN-LABEL: set_inactive_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT:    s_mov_b64 exec, s[2:3]
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_not_b64 exec, exec
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -249,17 +239,15 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
 ; GCN-LABEL: set_inactive_v4i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 0x10001
-; GCN-NEXT:    s_mov...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/98864
