[llvm] [AMDGPU] Add CodeGen support for GFX12 s_mul_u64 (PR #75825)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 2 03:55:48 PST 2024
================
@@ -2094,6 +2094,121 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
return true;
}
+// Break s_mul_u64 into 32-bit vector operations.
+void AMDGPURegisterBankInfo::applyMappingSMULU64(
+ const OperandsMapper &OpdMapper) const {
+
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // Insert basic copies.
+ applyDefaultMapping(OpdMapper);
+
+ Register SrcReg0 = MI.getOperand(1).getReg();
+ Register SrcReg1 = MI.getOperand(2).getReg();
+ assert(MRI.getRegBankOrNull(SrcReg0) == &AMDGPU::VGPRRegBank &&
+ MRI.getRegBankOrNull(SrcReg1) == &AMDGPU::VGPRRegBank &&
+ "Source operands should be in vector registers.");
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // Extract subregisters from the first operand
+ Register NewSrcReg0 = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(NewSrcReg0, &AMDGPU::VReg_64RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), NewSrcReg0)
+ .addReg(SrcReg0, 0, MI.getOperand(1).getSubReg());
+ Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
+ Register Op0H = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op0H, &AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), Op0L)
+ .addReg(NewSrcReg0, 0, AMDGPU::sub0);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), Op0H)
+ .addReg(NewSrcReg0, 0, AMDGPU::sub1);
+
+ // Extract subregisters from the second operand.
+ Register NewSrcReg1 = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(NewSrcReg1, &AMDGPU::VReg_64RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), NewSrcReg1)
+ .addReg(SrcReg1, 0, MI.getOperand(2).getSubReg());
+ Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
+ Register Op1H = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op1H, &AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), Op1L)
+ .addReg(NewSrcReg1, 0, AMDGPU::sub0);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), Op1H)
+ .addReg(NewSrcReg1, 0, AMDGPU::sub1);
+
+ // Split s_mul_u64 into 32-bit multiplications.
+ Register NewDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(NewDestReg, &AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(DestSub0, &AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(DestSub1, &AMDGPU::VGPR_32RegClass);
+
+ // The multiplication is done as follows:
+ //
+ //             Op1H      Op1L
+ //           * Op0H      Op0L
+ //           ---------------------
+ //             Op1H*Op0L Op1L*Op0L
+ // + Op1H*Op0H Op1L*Op0H
+ // -----------------------------------------
+ // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
+ //
+ // We drop Op1H*Op0H because it only contributes to bits 64 and above,
+ // which do not fit in the 64-bit result of the multiplication.
+ // The low 32-bit value is Op1L*Op0L.
+ // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
+ // Op1L*Op0L).
+
+ Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(Op1L_Op0H_Reg, &AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
----------------
arsenm wrote:
Why go straight to the selected instructions? It would be less code to emit the generic instructions, and that wouldn't destroy known-bits information for later analysis.
https://github.com/llvm/llvm-project/pull/75825
More information about the llvm-commits
mailing list