[llvm-branch-commits] [llvm] AMDGPU: Back-propagate wqm for sources of side-effect instruction (PR #193395)

Tue Apr 21 20:16:29 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Ruiling, Song (ruiling)

<details>
<summary>Changes</summary>

A readfirstlane instruction returns an undefined value if exec is zero. To handle the case where only helper lanes execute the parent block, we let the readfirstlane execute under WQM. But this is not enough. If the parent block is also executed by non-helper lanes, we also need to make sure that the sources of the readfirstlane are calculated under WQM. Otherwise, if the instruction that generates the source of the readfirstlane is executed in exact mode, the value will contain garbage data in the helper lanes. That garbage data may then be returned by the readfirstlane running under WQM.

To fix this issue, we need to enforce the back-propagation of WQM for instructions like readfirstlane. The back-propagation is only done if the instruction is possibly in the middle of a WQM region (determined by checking OutNeeds).

---
Full diff: https://github.com/llvm/llvm-project/pull/193395.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+33-4) 
- (modified) llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.ll (+3-3) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 5fd0c1e1064cb..e85e757a906f5 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -195,7 +195,8 @@ class SIWholeQuadMode {
                    std::vector<WorkItem> &Worklist);
   void markInstructionUses(const MachineInstr &MI, char Flag,
                            std::vector<WorkItem> &Worklist);
-  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist,
+                        SmallVector<MachineInstr *> &ExeczSideEffectInstrs);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
@@ -482,8 +483,9 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
 
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
-char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
-                                       std::vector<WorkItem> &Worklist) {
+char SIWholeQuadMode::scanInstructions(
+    MachineFunction &MF, std::vector<WorkItem> &Worklist,
+    SmallVector<MachineInstr *> &ExeczSideEffectInstrs) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
@@ -495,6 +497,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
   // instruction as needing e.g. WQM before visiting it and realizing it needs
   // WQM disabled.
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+
   for (MachineBasicBlock *MBB : RPOT) {
     BlockInfo &BBI = Blocks[MBB];
 
@@ -607,6 +610,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         }
       }
 
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) {
+        for (auto &Op : MI.uses()) {
+          if (!Op.isReg())
+            continue;
+          if (!TRI->isVectorRegister(*MRI, Op.getReg()))
+            continue;
+
+          ExeczSideEffectInstrs.push_back(&MI);
+          break;
+        }
+      }
+
       if (Flags) {
         markInstruction(MI, Flags, Worklist);
         GlobalFlags |= Flags;
@@ -715,7 +730,8 @@ void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
 
 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   std::vector<WorkItem> Worklist;
-  char GlobalFlags = scanInstructions(MF, Worklist);
+  SmallVector<MachineInstr *> ExeczSideEffectInstrs;
+  char GlobalFlags = scanInstructions(MF, Worklist, ExeczSideEffectInstrs);
 
   while (!Worklist.empty()) {
     WorkItem WI = Worklist.back();
@@ -725,6 +741,19 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
       propagateInstruction(*WI.MI, Worklist);
     else
       propagateBlock(*WI.MBB, Worklist);
+
+    if (Worklist.empty()) {
+      // Currently we let the instructions having sideeffect when execz to run
+      // under wqm, this avoids unwanted side-effect with exact mode if only
+      // helper lanes execute the parent block. At the same time, the wqm
+      // property should be back-propagated along the data-flow of their sources
+      // to ensure their sources have correct data for helper lanes.
+      for (auto *MI : ExeczSideEffectInstrs) {
+        InstrInfo II = Instructions[MI];
+        if (II.OutNeeds & StateWQM)
+          markInstructionUses(*MI, StateWQM, Worklist);
+      }
+    }
   }
 
   return GlobalFlags;
diff --git a/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.ll b/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.ll
index 655dc20053daa..1166817b7ffd8 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm-propagate-for-execz-side-effect.ll
@@ -31,13 +31,13 @@ define amdgpu_ps float @execz_side_effect_wqm_propagation(float %FragCoordY, i32
 ; CHECK-NEXT:    s_mov_b32 s15, s0
 ; CHECK-NEXT:    image_sample v2, [v0, v0, v3], s[8:15], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
 ; CHECK-NEXT:    image_atomic_add_uint v0, [v0, v0], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; CHECK-NEXT:    v_med3_num_f32 v3, 0, 0, 0
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s6
+; CHECK-NEXT:    v_med3_num_f32 v3, 0, 0, 0
 ; CHECK-NEXT:    s_buffer_load_b32 s6, s[0:3], 0x0
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
-; CHECK-NEXT:    v_readfirstlane_b32 s7, v3
 ; CHECK-NEXT:    s_wait_samplecnt 0x0
 ; CHECK-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_3)
+; CHECK-NEXT:    v_readfirstlane_b32 s7, v3
 ; CHECK-NEXT:    s_mul_f32 s7, s7, 0
 ; CHECK-NEXT:    s_mul_f32 s7, s7, 0
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)

``````````

</details>


https://github.com/llvm/llvm-project/pull/193395