[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 3 (PR #189226)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:11:27 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
<details>
<summary>Changes</summary>
Supported Ops (64-bit wave reduction): `and`, `or`, `xor`
---
Patch is 274.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189226.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+12-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll (+960-108)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll (+960-108)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll (+984-132)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4eb6bad007d59..dc7f0906159c4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5738,6 +5738,9 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_XOR_B64:
DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
break;
default:
@@ -5749,6 +5752,12 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
ClampOpc = AMDGPU::S_ADD_I32;
if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
+ else if (Opc == AMDGPU::S_AND_B64)
+ ClampOpc = AMDGPU::V_AND_B32_e64;
+ else if (Opc == AMDGPU::S_OR_B64)
+ ClampOpc = AMDGPU::V_OR_B32_e64;
+ else if (Opc == AMDGPU::S_XOR_B64)
+ ClampOpc = AMDGPU::V_XOR_B32_e64;
else
ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
}
@@ -6349,8 +6358,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
bool isAddSubOpc =
Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
+ bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
+ Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
- if (isAddSubOpc) {
+ if (isAddSubOpc || isBitWiseOpc) {
Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineOperand Src0Operand =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
index a767a6ab72828..ca82764a788f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -812,6 +812,858 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT: v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189226
More information about the llvm-branch-commits
mailing list