[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 2 (PR #189225)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:10:58 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
<details>
<summary>Changes</summary>
Supported Ops: `add`, `sub`
---
Patch is 232.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189225.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+73-20)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll (+1079-142)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll (+1113-146)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dfec8aaf56767..4eb6bad007d59 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5736,6 +5736,8 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
case AMDGPU::V_CMP_LT_I64_e64: // min.i64
case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
case AMDGPU::V_CMP_GT_I64_e64: // max.i64
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
break;
default:
@@ -5745,7 +5747,10 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
if (!ST.getInstrInfo()->isVALU(Opc)) {
if (Opc == AMDGPU::S_SUB_I32)
ClampOpc = AMDGPU::S_ADD_I32;
- ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
+ if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
+ ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
+ else
+ ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
}
return {DPPOpc, ClampOpc};
}
@@ -6301,40 +6306,80 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
if (!NeedsMovDPP)
DPPInstr.addReg(Src); // src1
+ if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
+ DPPInstr.addImm(0); // clamp
DPPInstr
.addImm(DPPCtrl) // dpp-ctrl
.addImm(0xf) // row-mask
.addImm(0xf) // bank-mask
.addImm(0); // bound-control
};
- auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1) {
- auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(ClampOpc), Dst);
+ auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
+ bool isAddSub = false,
+ bool needsCarryIn = false,
+ Register CarryIn = Register()) {
+ unsigned InstrOpc = ClampOpc;
+ Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ if (needsCarryIn)
+ InstrOpc = AMDGPU::V_ADDC_U32_e64;
+ auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src0 mod
+ if (isAddSub) {
+ if (needsCarryIn)
+ ClampInstr.addReg(CarryOutReg,
+ RegState::Define |
+ RegState::Dead); // dead carry-out reg (never read)
+ else
+ ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
+ }
ClampInstr.addReg(Src0); // src0
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src1 mod
ClampInstr.addReg(Src1); // src1
- if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
+ if (needsCarryIn)
+ ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
+ if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
ClampInstr.addImm(0); // clamp
if (isFPOp)
ClampInstr.addImm(0); // omod
LastBcastInstr = ClampInstr;
+ return CarryOutReg;
};
auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
- Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register MinMaxResultReg = MRI.createVirtualRegister(SrcRegClass);
- BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
- .addReg(Src0) // src0
- .addReg(Src1); // src1
- LastBcastInstr =
- BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
- MinMaxResultReg)
- .addReg(Src1) // src0
- .addReg(Src0) // src1
- .addReg(CmpMaskReg); // src2
- CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
- return MinMaxResultReg;
+ bool isAddSubOpc =
+ Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
+ Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
+ if (isAddSubOpc) {
+ Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineOperand Src0Operand =
+ MachineOperand::CreateReg(Src0, /*isDef=*/false);
+ MachineOperand Src1Operand =
+ MachineOperand::CreateReg(Src1, /*isDef=*/false);
+ auto [Src0Lo, Src0Hi] = ExtractSubRegs(MI, Src0Operand, SrcRegClass);
+ auto [Src1Lo, Src1Hi] = ExtractSubRegs(MI, Src1Operand, SrcRegClass);
+ Register CarryReg =
+ BuildClampInstr(ResLo, Src0Lo.getReg(), Src1Lo.getReg(),
+ isAddSubOpc, /*needsCarryIn*/ false);
+ BuildClampInstr(ResHi, Src0Hi.getReg(), Src1Hi.getReg(), isAddSubOpc,
+ /*needsCarryIn*/ isAddSubOpc ? true : false,
+ CarryReg);
+ BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
+ } else {
+ Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
+ .addReg(Src0) // src0
+ .addReg(Src1); // src1
+ LastBcastInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
+ ReturnReg)
+ .addReg(Src1) // src0
+ .addReg(Src0) // src1
+ .addReg(CmpMaskReg); // src2
+ CurrBB = Expand64BitV_CND_MASK(*LastBcastInstr, CurrBB);
+ }
+ return ReturnReg;
};
// Set inactive lanes to the identity value.
@@ -6549,14 +6594,22 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
LaneValueHiReg);
}
- if (Opc == AMDGPU::S_SUB_I32)
+ if (Opc == AMDGPU::S_SUB_I32) {
BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
.addImm(0)
.addReg(ReducedValSGPR);
+ } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
+ auto NegatedValInstr =
+ BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
+ .addImm(0)
+ .addReg(ReducedValSGPR);
+ CurrBB = Expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
+ }
// Mark the final result as a whole-wave-mode calculation.
BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
- .addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
- : ReducedValSGPR);
+ .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
+ ? NegatedReducedVal
+ : ReducedValSGPR);
RetBB = CurrBB;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
index d7fa9bc800634..b78b580d3ed59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -1009,6 +1009,943 @@ entry:
ret void
}
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT: s_nop 0
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8DAGISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT: s_nop 0
+; GFX8GISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
+; GFX8GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX8GISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT: s_nop 0
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9DAGISEL-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT: v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT: v_mov_b...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189225
More information about the llvm-branch-commits mailing list