[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 2 (PR #189225)

Tue Apr 14 02:21:17 PDT 2026

================
@@ -1009,6 +1009,943 @@ entry:
   ret void
 }
 
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
----------------
easyonaadit wrote:

I haven't fully tracked it down yet, but this is being done by the two-address instruction pass. I'll have to look into it further.

https://github.com/llvm/llvm-project/pull/189225