[llvm] [AMDGPU] Do not use original PHIs in coercion chains (PR #98063)

Mon Jul 8 11:35:57 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

<details>
<summary>Changes</summary>

It's possible that we are unable to coerce all the incoming values of a PHINode (A). Thus, we are unable to coerce the PHINode. In this situation, we previously would add the PHINode back to the ValMap. This would cause a problem is PhiNode (B) was a user of A. In this scenario, if B has been coerced, we would hit an assert regarding the incompatible type between the PHINode and its incoming value.

Deleting non-coerced PHINodes from the map resolves the issue.

---
Full diff: https://github.com/llvm/llvm-project/pull/98063.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+83) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 2cc95f81d2f94d..88811a24e2fcd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -369,7 +369,7 @@ bool LiveRegOptimizer::optimizeLiveType(
     if (MissingIncVal) {
       DeadInst = cast<Instruction>(ValMap[Phi]);
       // Do not use the dead phi
-      ValMap[Phi] = Phi;
+      ValMap.erase(Phi);
     }
     DeadInsts.emplace_back(DeadInst);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 441f00faf329e4..3202926e940746 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -866,5 +866,88 @@ bb.3:
   ret void
 }
 
+; This should not cause Assertion `getType() == V->getType() && "All operands to PHI node must be the same type as the PHI node
+; Note: whether or not the assertion fires depends on the iteration ortder of PhiNodes in AMDGPULateCodeGenPrepare, which
+; is non-deterministic due to iterators over a set of pointers.
+
+define amdgpu_kernel void @MissingInc_PhiChain(i1 %cmp1.i.i.i.i.i.not, <16 x i8> %promotealloca31.i.i.i.i) {
+; GFX906-LABEL: MissingInc_PhiChain:
+; GFX906:       ; %bb.0: ; %entry
+; GFX906-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX906-NEXT:    s_mov_b32 s10, 1
+; GFX906-NEXT:    v_mov_b32_e32 v4, 1
+; GFX906-NEXT:    s_mov_b32 s11, 1
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX906-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX906-NEXT:    s_xor_b64 s[0:1], s[2:3], -1
+; GFX906-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX906-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX906-NEXT:    s_branch .LBB14_2
+; GFX906-NEXT:  .LBB14_1: ; %if.end.1.i.i.i.i.i
+; GFX906-NEXT:    ; in Loop: Header=BB14_2 Depth=1
+; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX906-NEXT:    s_mov_b32 s10, 0
+; GFX906-NEXT:    s_mov_b32 s11, 0
+; GFX906-NEXT:  .LBB14_2: ; %for.body10.i.i.i.i
+; GFX906-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX906-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX906-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX906-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX906-NEXT:    s_cbranch_vccnz .LBB14_4
+; GFX906-NEXT:  ; %bb.3: ; %if.then.i.i.i.i.i
+; GFX906-NEXT:    ; in Loop: Header=BB14_2 Depth=1
+; GFX906-NEXT:    v_lshlrev_b16_e64 v0, 8, s11
+; GFX906-NEXT:    v_or_b32_sdwa v0, s10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v4
+; GFX906-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX906-NEXT:    s_mov_b64 s[8:9], -1
+; GFX906-NEXT:  .LBB14_4: ; %Flow
+; GFX906-NEXT:    ; in Loop: Header=BB14_2 Depth=1
+; GFX906-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GFX906-NEXT:    s_cbranch_vccnz .LBB14_7
+; GFX906-NEXT:  ; %bb.5: ; %if.end.i.i.i.i.i
+; GFX906-NEXT:    ; in Loop: Header=BB14_2 Depth=1
+; GFX906-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX906-NEXT:    s_cbranch_vccnz .LBB14_1
+; GFX906-NEXT:  ; %bb.6: ; %if.then.1.i.i.i.i.i
+; GFX906-NEXT:    ; in Loop: Header=BB14_2 Depth=1
+; GFX906-NEXT:    v_mov_b32_e32 v0, s4
+; GFX906-NEXT:    v_mov_b32_e32 v1, s5
+; GFX906-NEXT:    v_mov_b32_e32 v2, s6
+; GFX906-NEXT:    v_mov_b32_e32 v3, s7
+; GFX906-NEXT:    s_branch .LBB14_1
+; GFX906-NEXT:  .LBB14_7: ; in Loop: Header=BB14_2 Depth=1
+; GFX906-NEXT:    ; implicit-def: $vgpr4
+; GFX906-NEXT:    ; implicit-def: $sgpr10
+; GFX906-NEXT:    ; implicit-def: $sgpr11
+; GFX906-NEXT:    s_cbranch_execz .LBB14_2
+; GFX906-NEXT:  ; %bb.8: ; %DummyReturnBlock
+; GFX906-NEXT:    s_endpgm
+entry:
+  br label %for.body10.i.i.i.i
+
+for.body10.i.i.i.i:                               ; preds = %if.end.1.i.i.i.i.i, %entry
+  %promotealloca3237.i.i.i.i = phi <16 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %1, %if.end.1.i.i.i.i.i ]
+  br i1 %cmp1.i.i.i.i.i.not, label %if.end.i.i.i.i.i, label %if.then.i.i.i.i.i
+
+if.then.i.i.i.i.i:                                ; preds = %for.body10.i.i.i.i
+  %0 = insertelement <16 x i8> %promotealloca3237.i.i.i.i, i8 0, i64 0
+  br label %if.end.i.i.i.i.i
+
+if.end.i.i.i.i.i:                                 ; preds = %if.then.i.i.i.i.i, %for.body10.i.i.i.i
+  %promotealloca31.i.i.i.i3 = phi <16 x i8> [ %0, %if.then.i.i.i.i.i ], [ %promotealloca3237.i.i.i.i, %for.body10.i.i.i.i ]
+  br i1 %cmp1.i.i.i.i.i.not, label %if.end.1.i.i.i.i.i, label %if.then.1.i.i.i.i.i
+
+if.then.1.i.i.i.i.i:                              ; preds = %if.end.i.i.i.i.i
+  br label %if.end.1.i.i.i.i.i
+
+if.end.1.i.i.i.i.i:                               ; preds = %if.then.1.i.i.i.i.i, %if.end.i.i.i.i.i
+  %promotealloca30.i.i.i.i = phi <16 x i8> [ %promotealloca31.i.i.i.i, %if.then.1.i.i.i.i.i ], [ %promotealloca31.i.i.i.i3, %if.end.i.i.i.i.i ]
+  %1 = shufflevector <16 x i8> %promotealloca30.i.i.i.i, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  br label %for.body10.i.i.i.i
+}
+
 
 declare i32 @llvm.amdgcn.workitem.id.x()

``````````

</details>


https://github.com/llvm/llvm-project/pull/98063