[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for double types - 1 (PR #189390)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Mar 30 07:11:46 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
<details>
<summary>Changes</summary>
Supported ops: `fmin` and `fmax` (DPP wave reduction for `double` operands).
---
Patch is 272.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189390.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+27-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll (+1112-234)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll (+1112-234)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dc7f0906159c4..dcc342638c5c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5741,6 +5741,10 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
case AMDGPU::S_AND_B64:
case AMDGPU::S_OR_B64:
case AMDGPU::S_XOR_B64:
+ case AMDGPU::V_MIN_NUM_F64_e64:
+ case AMDGPU::V_MIN_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
+ case AMDGPU::V_MAX_F64_e64:
DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
break;
default:
@@ -6308,10 +6312,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
unsigned DPPCtrl) {
auto DPPInstr =
BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
- if (isFPOp)
+ if (isFPOp && !NeedsMovDPP)
DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
DPPInstr.addReg(Src); // src0
- if (isFPOp)
+ if (isFPOp && !NeedsMovDPP)
DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
if (!NeedsMovDPP)
DPPInstr.addReg(Src); // src1
@@ -6378,17 +6382,27 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
CarryReg);
BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
} else {
- Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
- BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
- .addReg(Src0) // src0
- .addReg(Src1); // src1
- LastBcastInstr =
- BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
- ReturnReg)
- .addReg(Src1) // src0
- .addReg(Src0) // src1
- .addReg(CmpMaskReg); // src2
- CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+ if (isFPOp) {
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), ReturnReg)
+ .addImm(SISrcMods::NONE) // src0 modifiers
+ .addReg(Src0)
+ .addImm(SISrcMods::NONE) // src1 modifiers
+ .addReg(Src1)
+ .addImm(SISrcMods::NONE) // clamp
+ .addImm(SISrcMods::NONE); // omod
+ } else {
+ Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+ .addReg(Src0) // src0
+ .addReg(Src1); // src1
+ LastBcastInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+ ReturnReg)
+ .addReg(Src1) // src0
+ .addReg(Src0) // src1
+ .addReg(CmpMaskReg); // src2
+ CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+ }
}
return ReturnReg;
};
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
index fbe099b46dc21..1e151ccf20c36 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
@@ -858,6 +858,884 @@ entry:
ret void
}
+define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) {
+; GFX8DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_double_dpp:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_double_dpp:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_double_dpp:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v6, v4, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: v_mov_b32_e32 v8, v6
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v8, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_max_f64 v[4:5], v[5:6], v[7:8]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT: s_nop 0
+; GFX9GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b32_e3...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189390
More information about the llvm-branch-commits mailing list